In [1]:
## import essential libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline
pd.options.display.max_columns = 31

In [2]:
## load the data
profiles = pd.read_csv('profiles.csv')

In [3]:
## create a list of categorical columns
cat_cols = profiles.select_dtypes(exclude=np.number).columns.tolist()
profiles[cat_cols].head(2)

Unnamed: 0,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,ethnicity,job,last_online,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,a little extra,strictly anything,socially,never,working on college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...","books:<br />\nabsurdistan, the republic, of mi...",food.<br />\nwater.<br />\ncell phone.<br />\n...,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet!<br />\nyou...,"asian, white",transportation,2012-06-28-20-30,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,average,mostly other,often,sometimes,working on space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories.<br /...,,,i am very open and will share just about anyth...,,white,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single


In [4]:
## create a list of words/phrases to be replaced
words_to_replace = ['working on', 'graduated from']

## replace words with an empty string
for word in words_to_replace:
    profiles.education = profiles.education.str.replace(word, '').str.strip()

## get the first word in the each string      
profiles.sign = profiles.sign.str.split().str.get(0)

In [5]:
num_cols = profiles.select_dtypes(np.number).columns.tolist()
profiles[num_cols].describe()

Unnamed: 0,age,height,income
count,59946.0,59943.0,59946.0
mean,32.34029,68.295281,20033.222534
std,9.452779,3.994803,97346.192104
min,18.0,1.0,-1.0
25%,26.0,66.0,-1.0
50%,30.0,68.0,-1.0
75%,37.0,71.0,-1.0
max,110.0,95.0,1000000.0


In [6]:
## replace income's -1.000000 with np.nan
profiles.income = profiles.income.replace(-1, np.nan)

In [7]:
null_df = profiles.isnull().sum().sort_values(ascending=False).reset_index().rename(columns={'index':'feature', 0:'count'})
null_df['percentage'] = null_df.apply(lambda row: row['count']/len(profiles), axis=1)
null_df

Unnamed: 0,feature,count,percentage
0,income,48442,0.808094
1,offspring,35561,0.593217
2,diet,24395,0.40695
3,religion,20226,0.337404
4,pets,19921,0.332316
5,essay8,19225,0.320705
6,drugs,14080,0.234878
7,essay6,13771,0.229723
8,essay9,12603,0.210239
9,essay7,12451,0.207704


In [8]:
## drop unnecessary columns and columns whose missing values percentage higher than 20%
cols_to_drop = profiles.filter(regex='essay.|last_online|income|offspring|diet|religion|pet').columns.tolist()
new_profiles = profiles.drop(cols_to_drop, axis=1)

## create two list of numerical columns and categorical columns
num_cols = new_profiles.select_dtypes(np.number).columns.tolist()
cat_cols = new_profiles.select_dtypes(exclude=np.number).columns.tolist()

## fill new_profiles np.nan with meaningful values
new_profiles[num_cols] = new_profiles[num_cols].apply(lambda x: x.fillna(x.median()))
new_profiles[cat_cols] = new_profiles[cat_cols].apply(lambda x: x.fillna(x.value_counts().index[0]))

In [9]:
## check if there're still Null values
new_profiles.isna().sum().sort_values(ascending=False)

status         0
speaks         0
smokes         0
sign           0
sex            0
orientation    0
location       0
job            0
height         0
ethnicity      0
education      0
drugs          0
drinks         0
body_type      0
age            0
dtype: int64

In [10]:
pd.DataFrame(
    {'Skewness':[new_profiles.age.skew(), new_profiles.height.skew()],
     'Kurtosis':[new_profiles.age.kurtosis(), new_profiles.height.kurtosis()]}, index=['Age', 'Height'])

Unnamed: 0,Skewness,Kurtosis
Age,1.265773,1.572518
Height,-0.463061,7.761588


In [11]:
## exclude rows where height is lower than 40 
new_profiles = new_profiles.loc[(new_profiles.height > 50) & (new_profiles.height < 90) ]

## perform log transformation on age
new_profiles.age = np.log(new_profiles.age)

In [12]:
pd.DataFrame(
    {'Skewness':[new_profiles.age.skew(), new_profiles.height.skew()],
     'Kurtosis':[new_profiles.age.kurtosis(), new_profiles.height.kurtosis()]}, index=['Age', 'Height'])

Unnamed: 0,Skewness,Kurtosis
Age,0.587702,-0.05812
Height,-0.047944,-0.347583


In [13]:
new_profiles[cat_cols].head()

Unnamed: 0,body_type,drinks,drugs,education,ethnicity,job,location,orientation,sex,sign,smokes,speaks,status
0,a little extra,socially,never,college/university,"asian, white",transportation,"south san francisco, california",straight,m,gemini,sometimes,english,single
1,average,often,sometimes,space camp,white,hospitality / travel,"oakland, california",straight,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,thin,socially,never,masters program,white,other,"san francisco, california",straight,m,pisces,no,"english, french, c++",available
3,thin,socially,never,college/university,white,student,"berkeley, california",straight,m,pisces,no,"english, german (poorly)",single
4,athletic,socially,never,college/university,"asian, black, other",artistic / musical / writer,"san francisco, california",straight,m,aquarius,no,english,single


In [14]:
## import encoding libraries
from sklearn.preprocessing import LabelEncoder

In [15]:
## encode categorical variables
encoder = LabelEncoder()
for column in cat_cols:
    new_profiles[f'{column}_code'] = encoder.fit_transform(new_profiles[column])

In [16]:
## map target variables
new_profiles.sex = new_profiles.sex.map({'m':0, 'f':1})

In [17]:
## import essential libraries
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
#from imblearn.over_sampling import SMOTENC, SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report, confusion_matrix

In [18]:
X = new_profiles.filter(
    ['age', 'height', 'body_type_code', 'education_code', 'ethnicity_code', 'job_code', 'orientation_code'])
y = new_profiles.sex

In [19]:
## split data into traning set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [20]:
## create a list to store models
estimators = []

## control how data is splitted
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [21]:
params = {
    'C': range(1,20),
    'max_iter':[1000]
}

log_reg = LogisticRegression()
clf_log = GridSearchCV(log_reg, param_grid=params, scoring='f1', cv=cv)
clf_log.fit(X_train, y_train)

print(f'Best parameters: {clf_log.best_params_}')
print(f'Best score: {clf_log.best_score_}')

estimators.append(['Log', clf_log.best_estimator_])

Best parameters: {'C': 6, 'max_iter': 1000}
Best score: 0.8050213251218071


In [22]:
params = {
    'n_neighbors': range(1,10),
}

kneighbors = KNeighborsClassifier()
clf_kneighbors = GridSearchCV(kneighbors, param_grid=params, scoring='f1', cv=cv)
clf_kneighbors.fit(X_train, y_train)

print(f'Best parameters: {clf_kneighbors.best_params_}')
print(f'Best score: {clf_kneighbors.best_score_}')

estimators.append(['KNeighbors', clf_kneighbors.best_estimator_])

Best parameters: {'n_neighbors': 9}
Best score: 0.8154629288478855


In [23]:
params = {
    'criterion':['gini','entropy'],
    'max_depth': np.arange(3, 15)
}

dtree = DecisionTreeClassifier()
clf_dtree = GridSearchCV(dtree, param_grid=params, scoring='f1', cv=cv)
clf_dtree.fit(X_train, y_train)

print(f'Best parameters: {clf_dtree.best_params_}')
print(f'Best score: {clf_dtree.best_score_}')

estimators.append(['Decision Tree', clf_dtree.best_estimator_])

Best parameters: {'criterion': 'entropy', 'max_depth': 9}
Best score: 0.8215244468556955


In [24]:
## perform cross validation using `cross_val_score`
scores_df = {'Model':[], 'Accuracy':[]}
for name, model in estimators:
    accuracies = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
    scores_df['Model'].append(name)
    scores_df['Accuracy'].append(accuracies.mean())

scores_df = pd.DataFrame(scores_df)
scores_df

Unnamed: 0,Model,Accuracy
0,Log,0.846191
1,KNeighbors,0.854839
2,Decision Tree,0.857949


## Section 2 - Prediction based on Essay Columns

In [25]:
#import TfidfVectorizer for preprocessing text data
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
cols_to_classify2 = ['sign','essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
def create_essay_df(df, cols_to_classify2):
    data = df[cols_to_classify2]
    
    essay_columns = ['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9']
    
    # drop records that have null values
    data = data.dropna()
    
    # delete all html tags, new line escape, and http links in the essay columns
    filled_df = data.replace({r'<[A-Za-z\/][^>]*>' : '', r'\n' : ' ', r'http[^ ]*[ ]' : ' ', r'http[^ ]*' : ''}, regex=True)

    # create the 'combined_essay' column by combining all the essay columns together
    filled_df['combined_essay'] = filled_df[essay_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
              
    return filled_df.drop(essay_columns, axis=1)

In [28]:
user_essay = create_essay_df(profiles, cols_to_classify2)

In [29]:
user_essay.head()

Unnamed: 0,sign,combined_essay
0,gemini,about me: i would love to think that i was so...
5,taurus,"i'm an australian living in san francisco, but..."
9,cancer,my names jake. i'm a creative guy and i look f...
10,taurus,"update: i'm seeing someone, so off the market ..."
11,leo,"i was born in wisconsin, grew up in iowa, and ..."


In [31]:
user_essay.sign.value_counts()

leo            2309
gemini         2299
libra          2298
cancer         2256
virgo          2223
taurus         2190
scorpio        2178
aries          2168
sagittarius    2141
pisces         2087
aquarius       2076
capricorn      1892
Name: sign, dtype: int64

In [33]:
# split data into user and user_labels group
user2 = user_essay['combined_essay']
user_labels2 = user_essay['sign'].copy()

# chage user_labels from text to number
le = LabelEncoder()
user_labels2 = le.fit_transform(user_labels2)
le.classes_

array(['aquarius', 'aries', 'cancer', 'capricorn', 'gemini', 'leo',
       'libra', 'pisces', 'sagittarius', 'scorpio', 'taurus', 'virgo'],
      dtype=object)

In [34]:
# split data into training set and testing set
training_data, testing_data, training_labels, testing_labels = train_test_split(
    user2, user_labels2, test_size=0.2, random_state=33)

In [35]:
# preprocessing data by using TfidfVectorizer
vectorizer = TfidfVectorizer()
prepared_training_data = vectorizer.fit_transform(training_data)

In [37]:
# train MultinomialNB model
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(prepared_training_data, training_labels)
multi_predictions = classifier.predict(prepared_training_data)
print(classification_report(training_labels, multi_predictions))

              precision    recall  f1-score   support

           0       1.00      0.02      0.03      1643
           1       1.00      0.06      0.12      1714
           2       0.65      0.82      0.73      1836
           3       1.00      0.00      0.01      1509
           4       0.73      0.82      0.77      1834
           5       0.25      0.99      0.40      1848
           6       0.24      1.00      0.39      1853
           7       1.00      0.06      0.12      1705
           8       1.00      0.04      0.08      1688
           9       1.00      0.13      0.23      1738
          10       0.98      0.33      0.49      1779
          11       0.99      0.22      0.36      1746

    accuracy                           0.39     20893
   macro avg       0.82      0.37      0.31     20893
weighted avg       0.81      0.39      0.32     20893



In [38]:
print(cross_val_score(classifier, prepared_training_data, training_labels, scoring='f1_macro', cv=5))

[0.03954122 0.03614473 0.03631791 0.03703821 0.0337957 ]
