# Preprocessing  
In this file I prepare the data for exploration by cleaning missing values, simplifying catagories, and dropping anything unecessary.

In [1]:
import pandas as pd 
import numpy as np 
import warnings
warnings.filterwarnings("ignore")

This dataset includes essay columns which whill be excluded from this project since they are outside the scope of what my goals are for this project. 

In [2]:
data = pd.read_csv('okcupid_profiles.csv')   
kc = ['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'ethnicity', 
      'height', 'income', 'job','last_online', 'location', 'offspring', 'pets', 'religion', 'sign','smokes', 'speaks'] 
data = data[kc]

In [3]:
data.head(3)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,income,job,last_online,location,offspring,pets,religion,sign,smokes,speaks
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (..."
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,...,-1,,2012-06-27-09-10,"san francisco, california",,has cats,,pisces but it doesn&rsquo;t matter,no,"english, french, c++"


## Dealing with the Target Variable
Here I begin to look at each individual column and start making decisions about what to keep or simplify. Starting with my target variable. Because the is going to be my target variable, I will start by removing the categories I will not be including using and creating a new column that aligns with my question by turning the job category into a binary. The column will answer whether or not the observation work in science/tech/computers or not.

In [4]:
#Here we narrow down the observations to those tha can actually be identified are not ambiguos such as student, retired, etc.
jobs_being_used = ['transportation', 'hospitality / travel', 'artistic / musical / writer', 'computer / hardware / software',
       'banking / financial / real estate', 'entertainment / media','sales / marketing / biz dev','medicine / health',
       'science / tech / engineering', 'executive / management','education / academia', 'clerical / administrative',
       'construction / craftsmanship', 'political / government', 'law / legal services','military'] 
df = data[data['job'].isin(jobs_being_used)]

In [5]:
df.job.value_counts() #Checks the value counts for each of the job catagories   

science / tech / engineering         4848
computer / hardware / software       4709
artistic / musical / writer          4439
sales / marketing / biz dev          4391
medicine / health                    3680
education / academia                 3513
executive / management               2373
banking / financial / real estate    2266
entertainment / media                2250
law / legal services                 1381
hospitality / travel                 1364
construction / craftsmanship         1021
clerical / administrative             805
political / government                708
transportation                        366
military                              204
Name: job, dtype: int64

In [6]:
#Creating another column that identies whether an observation belongs to my target class or not
df['works_in_tech'] = np.nan 
df.job.replace(['computer / hardware / software','science / tech / engineering'
                     ],'science/tech/computers', inplace=True)  
def tech_column(row): 
    if row['job'] ==  'science/tech/computers': 
        row['works_in_tech'] = 1 
    else: 
        row['works_in_tech'] = 0  
    return row 

df = df.apply(tech_column, axis = 1) 

In [7]:
df['works_in_tech'].value_counts() #checks to values counts for my newly created column

0    28761
1     9557
Name: works_in_tech, dtype: int64

## Tranforming the Features
Having dealt with the target, I can move on to tranforming every other column, deciding whether or not to use it, and dealing with missing values. 

In [33]:
df.isna().sum()

age                     0
sex                     0
orientation             0
body_type               0
diet                    0
drinks                  0
drugs                   0
education               0
ethnicity               0
height                  0
job                     0
location                0
offspring               0
pets                    0
religion                0
sign                    0
smokes                  0
speaks                  0
works_in_tech           0
above_average_height    0
dtype: int64

### Age Column 
The age column contains no missing values and has multiple values in each age and so will be left as is. 

In [9]:
df.age.describe() #Summerizes the age column

count    38318.000000
mean        33.368313
std          9.250639
min         18.000000
25%         27.000000
50%         31.000000
75%         38.000000
max         69.000000
Name: age, dtype: float64

### Status Column 
Because this data is coming from a dating site this is unlikely to representative and so will be dropped.  

In [10]:
del df['status']

### Sex Column 
This columns contains no missing values, but will be renames for clarity. 

In [11]:
df['sex'] = df['sex'].replace(['m','f'],['male','female']) 
df.sex.value_counts() 

male      24042
female    14276
Name: sex, dtype: int64

### Orientation Column 
This column is highly imbalanaced but will be left as is. 

In [12]:
df.orientation.value_counts() #Returns the catagory 

straight    33558
gay          3325
bisexual     1435
Name: orientation, dtype: int64

### Body Type Column 
This column has been reconfigured to group up similar descriptions. I have also changed the descriptions of "jacked" and "used up" as they are descriptors with ambiguous meaning and changed nan and 'rather not say' to unknown. 

In [13]:
df['body_type'] = df['body_type'].replace(['fit','athletic','thin','skinny','curvy','a little extra', 'full figured', 'overweight',
                                          'jacked','used up', 'rather not say','average'],
                                          ['body_fit/athletic','body_fit/athletic','body_thin/skinny','body_thin/skinny', 
                                           'body_curvy/full_figured', 'body_curvy/full_figured','body_curvy/full_figured',
                                           'body_curvy/full_figured', 'body_average', 'body_average', 'body_average','body_average'])#This code regroups and renames observations
df['body_type'] = df['body_type'].fillna('unknown_body_type') #This gives a value to missing values
df.body_type.value_counts() #Checks the counts of this column

body_fit/athletic          16835
body_average                9784
body_curvy/full_figured     4882
body_thin/skinny            3924
unknown_body_type           2893
Name: body_type, dtype: int64

### Diet Column 
#This column had a large number of missing values. I chose to include them in the anything catagory for the sake retaining as much 
#data as possible but also with the assumption that if wasn't important enough to fill in it is unlikely to be too relevant to the 
#observations life.

In [14]:
df['diet'] = df['diet'].replace(['strictly anything', 'mostly other', 'mostly anything', 'mostly vegetarian', 'strictly vegan', 
                                  'vegetarian','mostly halal', 'strictly vegetarian', 'strictly other', 'mostly kosher',
                                 'mostly vegan', 'vegan', 'strictly kosher', 'kosher', 'strictly halal', 'halal','other','anything'],
                                          ['diet_anything','diet_other','diet_anything','diet_vegan/vegetarian', 'diet_vegan/vegetarian', 'diet_vegan/vegetarian',
                                          'diet_other','diet_vegan/vegetarian', 'diet_other', 'diet_other', 'diet_vegan/vegetarian', 'diet_vegan/vegetarian','diet_other',
                                          'diet_other','diet_other','diet_other','diet_other','diet_anything'])#This code regroups and renames observations
df['diet'] = df['diet'].fillna('unkown_diet') #This gives a value to missing values
df.diet.value_counts() #Checks the counts of this column

diet_anything            18751
unkown_diet              14760
diet_vegan/vegetarian     3563
diet_other                1244
Name: diet, dtype: int64

### Drinks Column 
This will be simplified through regrouping and have missing values filled in. 

In [15]:
df['drinks'] = df['drinks'].replace(['socially', 'often', 'not at all', 'rarely','very often','desperately'], #renames and regroups column
                                    ['drinks', 'drinks','drinks_rarely/not_at_all','drinks_rarely/not_at_all','drinks','drinks' ])
df['drinks'] = df['drinks'].fillna('unknown_if_drinker') #fills in missing values
df.drinks.value_counts() #checks value counts

drinks                      31560
drinks_rarely/not_at_all     5566
unknown_if_drinker           1192
Name: drinks, dtype: int64

### Drugs Column 
This will be simplified through regrouping and have missing values filled in.

In [16]:
df['drugs'] = df['drugs'].replace(['never', 'sometimes', 'often'],
                                ['no_drugs', 'drugs','drugs'])#renames and regroups column
df['drugs'] = df['drugs'].fillna('unknown_if_drugs') #fills in missing values
df.drugs.value_counts() #checks value counts

no_drugs            24457
unknown_if_drugs     8987
drugs                4874
Name: drugs, dtype: int64

### Education Column 
This will be simplified through regrouping and have missing values filled in as well as changing any "space camp" responses to unknown since it seems unlikely that many people went to space camp.

In [17]:
df.education.replace(['graduated from college/university','working on masters program','graduated from masters program', 
                      'college/university','graduated from law school', 'graduated from two-year college','working on med school',
                       'graduated from ph.d program','graduated from med school', 'two-year college', 'working on ph.d program',
                       'dropped out of ph.d program','dropped out of med school','working on law school',
                      'dropped out of masters program', 'masters program', 'ph.d program', 'law school',
                      'dropped out of law school', 'med school'], 'college_graduate_or_higher', inplace=True) #renames and regroups column

df.education.replace(['working on college/university','dropped out of college/university','dropped out of high school',
                      'dropped out of two-year college', 'working on two-year college','high school', 'graduated from high school',
                      'working on high school'],'no_college_degree', inplace=True)  #renames and regroups column

df.education.replace(['working on space camp', 'graduated from space camp', 'dropped out of space camp',
                     'dropped out of space camp','space camp'
                     ],'unknown_education', inplace=True)   #renames and regroups column
df['education'] = df['education'].fillna('unknown_education') #fills in missing values
df.education.value_counts() #checks value counts

college_graduate_or_higher    30337
no_college_degree              4302
unknown_education              3679
Name: education, dtype: int64

### Ethnicity Column 
This column included multiple versions of mixed ethnicity so in an attempt somewhat even out the the minority groups I regrouped them by ones containing the names of some of the majority groups. The rest will be grouped as other.

In [18]:
df['ethnicity'] = df['ethnicity'].fillna('ethnicity_other/unknown') #fills in missing values
df.loc[df['ethnicity'].str.contains('indian'), 'ethnicity'] = 'indian' #renames and regroups all containing this ethnicity
df.loc[df['ethnicity'].str.contains('middle eastern'), 'ethnicity'] = 'middle_eastern' #renames and regroups all containing this ethnicity
df.loc[df['ethnicity'].str.contains('black'), 'ethnicity'] = 'black' #renames and regroups all containing this ethnicity
df.loc[df['ethnicity'].str.contains('hispanic'), 'ethnicity'] = 'hispanic/latin' #renames and regroups all containing this ethnicity
df.loc[df['ethnicity'].str.contains('asian'), 'ethnicity'] = 'asian' #renames and regroups all containing this ethnicity
df.ethnicity.replace(['pacific islander, white','pacific islander', 'native american, white', 'native american, white, other', 
                      'white, other','other', 'native american', 'pacific islander, other', 'pacific islander, white, other',
                      'native american, other', 'native american, pacific islander, white', 
                      'native american, pacific islander, white, other', 'native american, pacific islander'],'ethnicity_other/unknown', inplace=True) #renames and regroups all containing this ethnicity 
df.ethnicity.value_counts() #checks value counts

white                      22056
ethnicity_other/unknown     5112
asian                       4971
hispanic/latin              2833
black                       1797
indian                      1055
middle_eastern               494
Name: ethnicity, dtype: int64

### Height Column 
The column will be capped to reasonable nubers as descibed by this website: https://dqydj.com/height-percentile-calculator-for-men-and-women/.
According to this less than 1 percent of women are below 4'8" and less than 1 percent of men are taller than 6'3" with an std of about three inches inbetween. I will also create a new column that answers whether the observation is above average height.

In [19]:
def cap_height(row): #Is function that caps the height column
    if row['height'] > 75.0:
        row['height'] = 79.0 #column average
    if row['height'] < 52:
        row['height'] = 56
    return row  
df = df.apply(cap_height, axis = 1)        #applies the above function to the height column 
df = df[df['height'].notna()]   #drops the 1 missing value column

In [20]:
df.height.describe() #checks value counts

count    38317.000000
mean        68.531931
std          3.948027
min         52.000000
25%         66.000000
50%         69.000000
75%         71.000000
max         79.000000
Name: height, dtype: float64

In [21]:
 df['above_average_height'] = np.nan #Creates an empty function
def aah_column(row): #Creates a function that populates the height column
    male_avg = 69.2
    female_avg = 63.6
    if row['sex'] == 'm':
        if row['height'] > male_avg: 
            row['above_average_height'] = 'above_average_height' 
        else: 
            row['above_average_height'] = 'not_above_average_height' 
    else: 
        if row['height'] > female_avg: 
            row['above_average_height'] = 'above_average_height'  
        else: 
            row['above_average_height'] = 'not_above_average_height'
    return row 

df = df.apply(aah_column, axis = 1)#applies the above function to the height column

In [22]:
df.above_average_height.value_counts()

above_average_height        34106
not_above_average_height     4211
Name: above_average_height, dtype: int64

### Income Column 
#Column will be dropped due to most not having answered this question and being that it is from a dating site, the chances of users inflating this number in my opinion is high.

In [23]:
del df['income'] #drops the income column

### Last Online Column 
#I dont think column serves any purpose for my model especially given the limited time frame of the data collectionso it will be dropped.

In [24]:
del df['last_online'] #drops the last online column

### Location Column 
#The data for this data set was collected in the city of sanfrancisco so most observations are in that city. For the sake of simplicity I will be turning this column into a binary depending on simply whether or not the observation is in sanfrancisco. Additionally, I wonder if the column will even be worht keeping given that "not san fran" can include so many other places. 

In [25]:
df['location'] = df['location'].fillna('not_san_francisco') #fills missing values
df.location.replace(['san francisco, california', 'south san francisco, california'],  'san_francisco_ca', inplace=True) 
#df['location'] = np.where(df['location'] in ['san francisco, california', 'south san francisco, california'], 'san_francisco_ca', 'not_san_francisco')
df.location.replace(['oakland, california', 'daly city, california','atherton, california', 
                     'san leandro, california', 'san rafael, california', 'walnut creek, california',  'berkeley, california', 
                     'belmont, california','san jose, california', 'palo alto, california', 'emeryville, california', 
                     'el granada, california', 'castro valley, california', 'fairfax, california','mountain view, california', 
                     'menlo park, california', 'burlingame, california', 'alameda, california','benicia, california', 
                     'mill valley, california', 'san mateo, california', 'redwood city, california', 'el cerrito, california', 
                     'stanford, california', 'san pablo, california', 'lafayette, california', 'fremont, california', 
                     'orinda, california', 'novato, california', 'vallejo, california', 'san lorenzo, california','san carlos, california', 'pacifica, california',
                     'hayward, california', 'foster city, california','hercules, california', 'bolinas, california',
                     'larkspur, california', 'moraga, california', 'albany, california','martinez, california', 'san bruno, california',
                     'millbrae, california', 'el sobrante, california','richmond, california', 'petaluma, california','pinole, california', 'pleasant hill, california',
                     'san geronimo, california', 'san anselmo, california','sausalito, california', 'crockett, california',
                    'boulder, colorado', 'half moon bay, california','belvedere tiburon, california', 'montara, california', 'corte madera, california', 'new york, new york',
                     'green brae, california', 'ross, california', 'east palo alto, california', 'brisbane, california',
                     'hacienda heights, california', 'point richmond, california','sacramento, california', 'rodeo, california', 'portland, oregon',
                       'tucson, arizona', 'honolulu, hawaii', 'billings, montana','west oakland, california', 'glencove, california',
                       'tiburon, california', 'peoria, illinois', 'bellwood, illinois',
                       'nha trang, vietnam', 'hillsborough, california','moss beach, california', 'kensington, california', 'kentfield, california', 'redwood shores, california',
                       'woodside, california', 'lagunitas, california','studio city, california', 'concord, california','forest knolls, california', 'edinburgh, united kingdom',
                       'london, united kingdom', 'chicago, illinois', 'colma, california','los angeles, california', 'south wellfleet, massachusetts', 'piedmont, california', 'los gatos, california', 'boise, idaho',
                       'islip terrace, new york', 'sunnyvale, california', 'cambridge, massachusetts', 'ozone park, new york','jackson, mississippi', 'south orange, new jersey','atlanta, georgia', 'madrid, spain', 'port costa, california','nicasio, california', 'bellingham, washington',
                       'woodacre, california', 'boston, massachusetts','longwood, florida', 'westlake, california','granite bay, california', 'campbell, california',
                       'santa ana, california', 'santa rosa, california', 'nevada city, california', 'providence, rhode island','stockton, california', 'vancouver, british columbia, canada','pacheco, california', 'irvine, california',
                       'kansas city, missouri', 'kassel, germany','stinson beach, california', 'philadelphia, pennsylvania', 'amsterdam, netherlands', 'napa, california', 'austin, texas','brooklyn, new york', 'bonaduz, switzerland',
                       'salt lake city, utah', 'bayshore, california', 'south lake tahoe, california', 'vacaville, california', 'miami, florida', 'long beach, california', 'cincinnati, ohio','phoenix, arizona', 'rochester, michigan', 'santa cruz, california',
                        'freedom, california', 'san quentin, california','utica, michigan', 'seaside, california', 'santa monica, california', 'woodbridge, virginia',
                        'arcadia, california', 'san antonio, texas', 'kula, hawaii','washington, district of columbia', 'muir beach, california','canyon, california', 'minneapolis, minnesota','san diego, california'], 
                    'not_san_francisco', inplace=True)  #renames and regroups column 
df.location.value_counts() #checks value counts

san_francisco_ca     21067
not_san_francisco    17250
Name: location, dtype: int64

### Offspring Column 
#This column has many missing values, approximately 60% which will likely end up being the largest group here. For the rest they will simplified to whether or not they have kids. 

In [26]:
df['offspring'] = df['offspring'].fillna('unknown_if_kids') #fills missing values
df.offspring.replace(['has a kid', 'has kids', "has kids, but doesn't want more", "has a kid, but doesn't want more",
                      'has kids, and might want more', 'has a kid, and might want more', 'has a kid, and wants more', 
                      'has kids, and wants more'
                     ],'has_kids', inplace=True) #renames and regroups column 
df.offspring.replace(["doesn't have kids, but might want them","doesn't have kids, but wants them", "doesn't have kids",
                      'might want kids', "doesn't have kids, and doesn't want any", "doesn't want kids",'wants kids',
                     ],'no_kids', inplace=True) #renames and regroups column 
df.offspring.value_counts() #checks value counts

unknown_if_kids    21721
no_kids            13069
has_kids            3527
Name: offspring, dtype: int64

### Pets Column 
This column contained many attitudes but it has been simplified to just wether observations like both cats and dogs, likes cats or dogs, and like neither. 

In [27]:
df['pets'] = df['pets'].fillna('unknown_pet_feelings') #fills missing values
df.pets.replace(['likes dogs and likes cats', 'likes dogs and has cats','has dogs and likes cats','has dogs and has cats', 
                     ],'likes_dogs_and_cats', inplace=True) #renames and regroups column
df.pets.replace(['likes dogs and dislikes cats','has dogs','likes dogs','has dogs and dislikes cats',
                     ],'likes_dogs', inplace=True) #renames and regroups column
df.pets.replace(['likes cats','has cats','dislikes dogs and likes cats', 'dislikes dogs and has cats'
                     ],'likes_cats', inplace=True) #renames and regroups column
df.pets.replace(['dislikes cats','dislikes dogs and dislikes cats','dislikes dogs'
                     ],'dislikes_cats_and_or_dogs', inplace=True) #renames and regroups column
df.pets.value_counts() #checks value counts

likes_dogs_and_cats          15161
unknown_pet_feelings         11502
likes_dogs                    9538
likes_cats                    1875
dislikes_cats_and_or_dogs      241
Name: pets, dtype: int64

### Religion Column 
This column has been simplified to remove attitudes about religion as well as fill in missing values.

In [28]:
df['religion'] = df['religion'].fillna('religion_other/unknown') #fills missing values
df.loc[df['religion'].str.contains('agnosticism'), 'religion'] = 'agnosticism' #renames and regroups column
df.loc[df['religion'].str.contains('catholicism'), 'religion'] = 'catholicism' #renames and regroups column
df.loc[df['religion'].str.contains('christianity'), 'religion'] = 'christianity' #renames and regroups column
df.loc[df['religion'].str.contains('atheism'), 'religion'] = 'atheism' #renames and regroups column
df.loc[df['religion'].str.contains('judaism'), 'religion'] = 'judaism' #renames and regroups column
df.loc[df['religion'].str.contains('buddhism'), 'religion'] = 'buddhism' #renames and regroups column
df.loc[df['religion'].str.contains('hinduism'), 'religion'] = 'hinduism' #renames and regroups column
df.loc[df['religion'].str.contains('other'), 'religion'] = 'religion_other/unknown' #renames and regroups column
df.loc[df['religion'].str.contains('islam'), 'religion'] = 'islam'  #renames and regroups column
df.religion.value_counts() #checks value counts

religion_other/unknown    16602
agnosticism                6084
atheism                    4659
christianity               3788
catholicism                3265
judaism                    2165
buddhism                   1340
hinduism                    341
islam                        73
Name: religion, dtype: int64

### Sign Column 
This column has been simplified to remove attitudes about astrology as well as fill in missing values.

In [29]:
df['sign'] = df['sign'].fillna('sign_unknown') #fills missing values
df.loc[df['sign'].str.contains('aries'), 'sign'] = 'aries' #renames and regroups column
df.loc[df['sign'].str.contains('taurus'), 'sign'] = 'taurus' #renames and regroups column
df.loc[df['sign'].str.contains('gemini'), 'sign'] = 'gemini' #renames and regroups column
df.loc[df['sign'].str.contains('cancer'), 'sign'] = 'cancer' #renames and regroups column
df.loc[df['sign'].str.contains('leo'), 'sign'] = 'leo' #renames and regroups column
df.loc[df['sign'].str.contains('virgo'), 'sign'] = 'virgo' #renames and regroups column
df.loc[df['sign'].str.contains('libra'), 'sign'] = 'libra' #renames and regroups column
df.loc[df['sign'].str.contains('scorpio'), 'sign'] = 'scorpio' #renames and regroups column
df.loc[df['sign'].str.contains('sagittarius'), 'sign'] = 'sagittarius' #renames and regroups column
df.loc[df['sign'].str.contains('capricorn'), 'sign'] = 'capricorn' #renames and regroups column
df.loc[df['sign'].str.contains('aquarius'), 'sign'] = 'aquarius' #renames and regroups column
df.loc[df['sign'].str.contains('pisces'), 'sign'] = 'pisces' #renames and regroups column
df.sign.value_counts() #checks value counts

sign_unknown    6173
gemini          2884
leo             2879
cancer          2796
libra           2772
scorpio         2762
taurus          2713
virgo           2696
aries           2608
pisces          2600
aquarius        2562
sagittarius     2534
capricorn       2338
Name: sign, dtype: int64

### Smokes Column 
This column has been simplified to remove the amount they smoke as well as fill in missing values.

In [30]:
df['smokes'] = df['smokes'].fillna('unknown_if_smokes') #fills missing values
df.smokes.replace(['no'],'does_not_smoke', inplace=True) #renames and regroups column
df.smokes.replace(['sometimes','when drinking','yes','trying to quit'],'smokes', inplace=True) #renames and regroups column
df.smokes.value_counts() #checks value counts

does_not_smoke       29396
smokes                6150
unknown_if_smokes     2771
Name: smokes, dtype: int64

### Speaks Column 
#This column will be adjusted since it includes english plus other languages. Since all but 30 observations are known and include english I decided to note if the observation speaks one of the other top 8 languages (not included english) of starting with the least popular and moving up thru most popular. 

In [32]:
df['speaks'] = df['speaks'].fillna('language_unknown') #fills missing values
df.loc[df['speaks'].str.contains('japanese'), 'speaks'] = 'speaks_japanese' #renames and regroups column
df.loc[df['speaks'].str.contains('russian'), 'speaks'] = 'speaks_russian' #renames and regroups column
df.loc[df['speaks'].str.contains('portuguese'), 'speaks'] = 'speaks_portuguese' #renames and regroups column
df.loc[df['speaks'].str.contains('bengali'), 'speaks'] = 'speaks_bengali' #renames and regroups column
df.loc[df['speaks'].str.contains('hindi'), 'speaks'] = 'speaks_hindi' #renames and regroups column
df.loc[df['speaks'].str.contains('spanish'), 'speaks'] = 'speaks_spanish' #renames and regroups column
df.loc[df['speaks'].str.contains('chinese'), 'speaks'] = 'speaks_chinese' #renames and regroups column
df.loc[df['speaks'].str.contains('english'), 'speaks'] = 'speaks_english' #renames and regroups column
df.speaks.value_counts() #checks value counts

speaks_english       23742
speaks_spanish        9334
speaks_chinese        1612
speaks_japanese       1506
speaks_russian         844
speaks_portuguese      659
speaks_hindi           536
speaks_bengali          54
language_unknown        30
Name: speaks, dtype: int64

## Saving Cleaned Data Set
#Having addressed in all the features, I will now save all of these changes as a new csv file to begin data exploration. 

In [34]:
df.to_csv('processed_data.csv')