In [1]:
import pandas as pd


In [2]:
# Sample data: five people and the city each one lives in.
# Built in a single call from a column-name -> values mapping.
df = pd.DataFrame({
    'name': ['Uri', 'Oran', 'Uri', 'Izhak', 'Moshe'],
    'City': ['Haifa', 'TLV', 'TLV', 'Natanya', 'Haifa'],
})
df

Unnamed: 0,name,City
0,Uri,Haifa
1,Oran,TLV
2,Uri,TLV
3,Izhak,Natanya
4,Moshe,Haifa


In [3]:
# Select the 'name' column; single-bracket indexing returns it as a Series.
df['name']

0      Uri
1     Oran
2      Uri
3    Izhak
4    Moshe
Name: name, dtype: object

In [4]:
# Boolean mask: True for the rows whose name is 'Uri'.
# The comparison is applied to the whole column at once, so no Python
# function has to be called row by row (as apply(lambda ...) would do).
df['name'] == 'Uri'

0     True
1    False
2     True
3    False
4    False
Name: name, dtype: bool

In [5]:
# Distinct names, in order of first appearance.
df['name'].unique()

array(['Uri', 'Oran', 'Izhak', 'Moshe'], dtype=object)

In [6]:
# How many distinct names there are.
df['name'].nunique()

4

In [7]:
# Number of distinct values in every column; the result has one entry per column.
df.nunique()

name    4
City    3
dtype: int64

In [8]:
# Frequency of each name, most common first.
df['name'].value_counts()

Uri      2
Oran     1
Moshe    1
Izhak    1
Name: name, dtype: int64

Reminder of the original DataFrame:

In [9]:
# Rebuild the sample frame from scratch so the one-hot steps below start
# from the original two columns.
df = pd.DataFrame({
    'name': ['Uri', 'Oran', 'Uri', 'Izhak', 'Moshe'],
    'City': ['Haifa', 'TLV', 'TLV', 'Natanya', 'Haifa'],
})
df

Unnamed: 0,name,City
0,Uri,Haifa
1,Oran,TLV
2,Uri,TLV
3,Izhak,Natanya
4,Moshe,Haifa


# Manual one-hot encoding

In [10]:
# Hand-written one-hot encoding: one 0/1 indicator column per city.
# The boolean comparison is cast to int so the columns hold 0/1 rather
# than False/True. Keyword order fixes the order of the new columns.
df = df.assign(
    Is_TLV=df['City'].eq('TLV').astype(int),
    Is_Haifa=df['City'].eq('Haifa').astype(int),
    Is_Natanya=df['City'].eq('Natanya').astype(int),
)
df

Unnamed: 0,name,City,Is_TLV,Is_Haifa,Is_Natanya
0,Uri,Haifa,0,1,0
1,Oran,TLV,1,0,0
2,Uri,TLV,1,0,0
3,Izhak,Natanya,0,0,1
4,Moshe,Haifa,0,1,0


Drop original City column

In [11]:
# The indicator columns now carry the city information, so the original
# 'City' column is removed. drop() returns a new frame, which is rebound to df.
df = df.drop(columns='City')
df

Unnamed: 0,name,Is_TLV,Is_Haifa,Is_Natanya
0,Uri,0,1,0
1,Oran,1,0,0
2,Uri,1,0,0
3,Izhak,0,0,1
4,Moshe,0,1,0


# Another way to one-hot encode (based on a Stack Overflow answer)

In [12]:
# Fresh copy of the sample frame for the get_dummies-based approach.
df = pd.DataFrame({
    'name': ['Uri', 'Oran', 'Uri', 'Izhak', 'Moshe'],
    'City': ['Haifa', 'TLV', 'TLV', 'Natanya', 'Haifa'],
})
df

Unnamed: 0,name,City
0,Uri,Haifa
1,Oran,TLV
2,Uri,TLV
3,Izhak,Natanya
4,Moshe,Haifa


In [13]:
# One indicator column per distinct city, each named after the city itself.
# dtype=int pins the values to 0/1: without it the result dtype depends on
# the pandas version (pandas >= 2.0 returns True/False).
one_hot = pd.get_dummies(df['City'], dtype=int)
one_hot

Unnamed: 0,Haifa,Natanya,TLV
0,1,0,0
1,0,0,1
2,0,0,1
3,0,1,0
4,1,0,0


In [14]:
# Show df again: get_dummies produced a separate frame (one_hot), so df
# itself still has its original columns at this point.
df

Unnamed: 0,name,City
0,Uri,Haifa
1,Oran,TLV
2,Uri,TLV
3,Izhak,Natanya
4,Moshe,Haifa


In [15]:
# Remove the original 'City' column; its information is held by one_hot.
df = df.drop(columns='City')
df

Unnamed: 0,name
0,Uri
1,Oran
2,Uri
3,Izhak
4,Moshe


In [16]:
# Attach the indicator columns to df. join() matches rows of the two
# frames by index label, and both frames share the same index here.
df = df.join(one_hot)
df

Unnamed: 0,name,Haifa,Natanya,TLV
0,Uri,1,0,0
1,Oran,0,0,1
2,Uri,0,0,1
3,Izhak,0,1,0
4,Moshe,1,0,0


# Another way to do it

In [17]:
# Fresh copy of the sample frame for the boolean-indicator approach.
df = pd.DataFrame({
    'name': ['Uri', 'Oran', 'Uri', 'Izhak', 'Moshe'],
    'City': ['Haifa', 'TLV', 'TLV', 'Natanya', 'Haifa'],
})
df

Unnamed: 0,name,City
0,Uri,Haifa
1,Oran,TLV
2,Uri,TLV
3,Izhak,Natanya
4,Moshe,Haifa


In [18]:
# Boolean (True/False) indicator column per city.
# Each comparison runs on the whole 'City' column at once instead of
# calling a Python lambda once per row via apply().
df['Is_Haifa'] = df['City'] == 'Haifa'
df['Is_TLV'] = df['City'] == 'TLV'
df['Is_Natanya'] = df['City'] == 'Natanya'
df

Unnamed: 0,name,City,Is_Haifa,Is_TLV,Is_Natanya
0,Uri,Haifa,True,False,False
1,Oran,TLV,False,True,False
2,Uri,TLV,False,True,False
3,Izhak,Natanya,False,False,True
4,Moshe,Haifa,True,False,False


In [19]:
# The boolean indicator columns replace 'City', so the original column is
# removed. drop() returns a new frame, which is rebound to df.
df = df.drop(columns='City')
df

Unnamed: 0,name,Is_Haifa,Is_TLV,Is_Natanya
0,Uri,True,False,False
1,Oran,False,True,False
2,Uri,False,True,False
3,Izhak,False,False,True
4,Moshe,True,False,False


# Another, more efficient way

In [20]:
# Fresh copy of the sample frame for the single-call get_dummies approach.
df = pd.DataFrame({
    'name': ['Uri', 'Oran', 'Uri', 'Izhak', 'Moshe'],
    'City': ['Haifa', 'TLV', 'TLV', 'Natanya', 'Haifa'],
})
df

Unnamed: 0,name,City
0,Uri,Haifa
1,Oran,TLV
2,Uri,TLV
3,Izhak,Natanya
4,Moshe,Haifa


In [21]:
# One-hot encode 'City' in a single call: the column is replaced by one
# indicator column per distinct city, and 'name' is left untouched.
# dtype=int pins the values to 0/1 (pandas >= 2.0 would otherwise return
# True/False).
# NOTE(review): prefix='Is_' combined with the default prefix_sep='_' gives
# double-underscore names (Is__Haifa, Is__Natanya, Is__TLV). prefix='Is'
# would give Is_Haifa etc.; the names are kept unchanged here because any
# code that reads these columns by name would break -- confirm before renaming.
df = pd.get_dummies(df, prefix='Is_', columns=['City'], dtype=int)
df

Unnamed: 0,name,Is__Haifa,Is__Natanya,Is__TLV
0,Uri,1,0,0
1,Oran,0,0,1
2,Uri,0,0,1
3,Izhak,0,1,0
4,Moshe,1,0,0


# From lesson

In [24]:
# Display the current state of df before selecting columns from it.
df

Unnamed: 0,name,Is__Haifa,Is__Natanya,Is__TLV
0,Uri,1,0,0
1,Oran,0,0,1
2,Uri,0,0,1
3,Izhak,0,1,0
4,Moshe,1,0,0


In [27]:
# Keep every second column, i.e. the columns at positions 0, 2, 4, ...
# Positional slicing with a step of 2 expresses this directly, without
# enumerating the column labels and testing each position for evenness.
df.iloc[:, ::2]

Unnamed: 0,name,Is__Natanya
0,Uri,0
1,Oran,0
2,Uri,0
3,Izhak,1
4,Moshe,0
