In [1]:
## importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score


# Clean data, make new table

In [2]:
## Loading csv into pandas dataframe
cupid_df = pd.read_csv(r"C:\Users\sghad\OneDrive\Desktop\DA\Final Project\data_set\okcupid_data.csv")

In [3]:
cupid_new_df = cupid_df[['diet', 'drinks', 'drugs', 'smokes', 'body_type']]

In [4]:
cupid_new_clean = cupid_new_df.dropna()

In [5]:
cupid_new_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25202 entries, 2 to 59945
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   diet       25202 non-null  object
 1   drinks     25202 non-null  object
 2   drugs      25202 non-null  object
 3   smokes     25202 non-null  object
 4   body_type  25202 non-null  object
dtypes: object(5)
memory usage: 1.2+ MB


In [6]:
# Encode categorical variables using one-hot encoding : to create binary columns for each unique value in the categorical columns
columns_to_encode = ['diet', 'drinks', 'drugs', 'smokes']
cupid_encoded = pd.get_dummies(cupid_new_clean, columns=columns_to_encode, prefix=columns_to_encode)

In [18]:
cupid_encoded.body_type

2         average
27        average
29          curvy
30           thin
31        average
           ...   
59934    athletic
59937    athletic
59942        thin
59943    athletic
59945     average
Name: body_type, Length: 25202, dtype: object

In [7]:
# label encoding for the target variable
le = LabelEncoder()
cupid_encoded['body_type_encoded'] = le.fit_transform(cupid_encoded['body_type'])

In [13]:
# Split the data into features (X) and target variable (y)
X = cupid_encoded.drop(['body_type', 'body_type_encoded' ], axis=1)
y = cupid_encoded['body_type_encoded']

X

Unnamed: 0,diet_anything,diet_halal,diet_kosher,diet_mostly anything,diet_mostly halal,diet_mostly kosher,diet_mostly other,diet_mostly vegan,diet_mostly vegetarian,diet_other,...,drinks_socially,drinks_very often,drugs_never,drugs_often,drugs_sometimes,smokes_no,smokes_sometimes,smokes_trying to quit,smokes_when drinking,smokes_yes
2,True,False,False,False,False,False,False,False,False,False,...,True,False,False,False,True,False,True,False,False,False
27,False,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
29,False,False,False,True,False,False,False,False,False,False,...,True,False,False,False,True,True,False,False,False,False
30,True,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,False,True,False,False,False
31,True,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59934,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,False,False,False,False
59937,False,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,True,False,False,False,False
59942,False,False,False,True,False,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
59943,True,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,True,False,False,False,False


In [14]:
X.columns

Index(['diet_anything', 'diet_halal', 'diet_kosher', 'diet_mostly anything',
       'diet_mostly halal', 'diet_mostly kosher', 'diet_mostly other',
       'diet_mostly vegan', 'diet_mostly vegetarian', 'diet_other',
       'diet_strictly anything', 'diet_strictly halal', 'diet_strictly kosher',
       'diet_strictly other', 'diet_strictly vegan',
       'diet_strictly vegetarian', 'diet_vegan', 'diet_vegetarian',
       'drinks_desperately', 'drinks_not at all', 'drinks_often',
       'drinks_rarely', 'drinks_socially', 'drinks_very often', 'drugs_never',
       'drugs_often', 'drugs_sometimes', 'smokes_no', 'smokes_sometimes',
       'smokes_trying to quit', 'smokes_when drinking', 'smokes_yes'],
      dtype='object')

In [15]:
# Handle missing values
imputer = SimpleImputer()
X_imputed = imputer.fit_transform(X)

In [16]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=0)

# Logistic Regression

In [17]:
lr_model = LogisticRegression(multi_class='multinomial', max_iter=1000)
lr_model.fit(X_train, y_train)
lr_predictions_train = lr_model.predict(X_train)
lr_predictions_test = lr_model.predict(X_test)

print("Logistic Regression - Train Set")
print(classification_report(y_train, lr_predictions_train))

print("Logistic Regression - Test Set")
print(classification_report(y_test, lr_predictions_test))

Logistic Regression - Train Set
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1058
           1       0.27      0.13      0.17      4276
           2       0.28      0.82      0.42      5424
           3       0.00      0.00      0.00      1492
           4       0.30      0.16      0.20      4578
           5       1.00      0.00      0.01       367
           6       0.33      0.01      0.02       162
           7       0.00      0.00      0.00       186
           8       0.00      0.00      0.00        63
           9       0.00      0.00      0.00       643
          10       0.12      0.00      0.00      1765
          11       0.15      0.03      0.05       147

    accuracy                           0.28     20161
   macro avg       0.21      0.10      0.07     20161
weighted avg       0.23      0.28      0.20     20161

Logistic Regression - Test Set
              precision    recall  f1-score   support

           0   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# K-Nearest Neighbor

In [19]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_predictions_train = knn_model.predict(X_train)
knn_predictions_test = knn_model.predict(X_test)

print("K-Nearest Neighbors - Train Set")
print(classification_report(y_train, knn_predictions_train))

print("K-Nearest Neighbors - Test Set")
print(classification_report(y_test, knn_predictions_test))

K-Nearest Neighbors - Train Set
              precision    recall  f1-score   support

           0       0.06      0.09      0.07      1058
           1       0.23      0.42      0.30      4276
           2       0.29      0.48      0.36      5424
           3       0.22      0.03      0.05      1492
           4       0.27      0.07      0.11      4578
           5       0.14      0.01      0.02       367
           6       0.33      0.02      0.05       162
           7       0.71      0.03      0.05       186
           8       0.00      0.00      0.00        63
           9       0.30      0.05      0.08       643
          10       0.21      0.04      0.07      1765
          11       0.50      0.01      0.01       147

    accuracy                           0.25     20161
   macro avg       0.27      0.10      0.10     20161
weighted avg       0.25      0.25      0.20     20161

K-Nearest Neighbors - Test Set


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.03      0.05      0.04       254
           1       0.22      0.41      0.29      1090
           2       0.28      0.45      0.35      1378
           3       0.10      0.01      0.02       351
           4       0.21      0.06      0.09      1164
           5       0.00      0.00      0.00        97
           6       0.00      0.00      0.00        29
           7       0.00      0.00      0.00        41
           8       0.00      0.00      0.00        15
           9       0.03      0.01      0.01       161
          10       0.12      0.02      0.04       426
          11       0.00      0.00      0.00        35

    accuracy                           0.23      5041
   macro avg       0.08      0.08      0.07      5041
weighted avg       0.19      0.23      0.19      5041



  _warn_prf(average, modifier, msg_start, len(result))


# Decision Tree

In [None]:
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_predictions_train = tree_model.predict(X_train)
tree_predictions_test = tree_model.predict(X_test)

print("Decision Tree - Train Set")
print(classification_report(y_train, tree_predictions_train))

print("Decision Tree - Test Set")
print(classification_report(y_test, tree_predictions_test))

# Cross-validation
lr_cv_scores = cross_val_score(lr_model, X_imputed, y, cv=5)
print("Logistic Regression - Cross-Validation Scores:", lr_cv_scores)

knn_cv_scores = cross_val_score(knn_model, X_imputed, y, cv=5)
print("K-Nearest Neighbors - Cross-Validation Scores:", knn_cv_scores)

tree_cv_scores = cross_val_score(tree_model, X_imputed, y, cv=5)
print("Decision Tree - Cross-Validation Scores:", tree_cv_scores)

Decision Tree - Train Set
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1058
           1       1.00      1.00      1.00      4276
           2       1.00      1.00      1.00      5424
           3       1.00      1.00      1.00      1492
           4       1.00      1.00      1.00      4578
           5       1.00      1.00      1.00       367
           6       1.00      1.00      1.00       162
           7       1.00      1.00      1.00       186
           8       1.00      1.00      1.00        63
           9       1.00      1.00      1.00       643
          10       1.00      1.00      1.00      1765
          11       1.00      1.00      1.00       147

    accuracy                           1.00     20161
   macro avg       1.00      1.00      1.00     20161
weighted avg       1.00      1.00      1.00     20161

Decision Tree - Test Set
              precision    recall  f1-score   support

           0       1.00    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression - Cross-Validation Scores: [0.99504067 0.99484229 0.99484127 0.99642857 0.9952381 ]
K-Nearest Neighbors - Cross-Validation Scores: [0.99920651 0.99662765 0.99781746 0.99880952 0.99742063]
Decision Tree - Cross-Validation Scores: [1. 1. 1. 1. 1.]


Precision: the ratio of true positive predictions to the total number of positive predictions

Precision = True Positives / (True Positives + False Positives)

True rate:

Recall = True Positives / (True Positives + False Negatives)

F1:combines precision and recall into a single metric, providing a balanced measure of the model's performance. F1-score considers both false 
positives and false negatives and is useful when there is an imbalance between the classes in the dataset.

F1-score = 2 * (Precision * Recall) / (Precision + Recall)
