In [1]:
import os
import json

import pandas as pd
import numpy as np
import sklearn.svm

from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the max_iter-capped
# SVC/SVR fits further down — confirm that is intended.
warnings.filterwarnings('ignore')

# Apply the "ggplot" stylesheet to all matplotlib figures.
plt.style.use("ggplot")

### Open Data

In [20]:
# Read the preprocessed bundle: a serialized frame plus three index lists.
with open("data/processed/processed_data.json") as pfile:
    info = json.load(pfile)

df = pd.DataFrame(info["df"])
train_indices, test_indices, val_indices = (
    info[key] for key in ("train_indices", "test_indices", "val_indices")
)

# Leakage check: print the pairwise overlap between the three index lists,
# one array per line (train/test, train/val, test/val).
print(
    np.intersect1d(train_indices, test_indices),
    np.intersect1d(train_indices, val_indices),
    np.intersect1d(test_indices, val_indices),
    sep="\n",
)

# One LabelEncoder per categorical target, kept around by name.
encoder = {target: LabelEncoder() for target in ("gender", "status")}

# Targets: integer-encoded Gender / Status and the raw Age column.
gender_vec = encoder["gender"].fit_transform(df["Gender"])
status_vec = encoder["status"].fit_transform(df["Status"])
age_vec = df["Age"].values

# Stack the per-row feature lists of the "features" column into one array.
X = df["features"].values.tolist()
X = np.asarray(list(map(np.array, X)))

# Slice the feature array with the stored index lists.
X_train, X_val, X_test = (
    X[idx] for idx in (train_indices, val_indices, test_indices)
)



[]
[]
[]


### Gender Classifier

In [8]:
# Slice the encoded gender labels with the same index lists used for X.
gender_train = gender_vec[train_indices]
gender_val = gender_vec[val_indices]
gender_test = gender_vec[test_indices]

# Single-step pipeline; the grid below swaps the 'estimator' step itself,
# so the SVC() given here is only a placeholder.
pipe = Pipeline(steps=[('estimator', SVC())])

# 20 log-spaced values from 2**-3 to 2**6, used for both C and gamma.
svm_scale_grid = np.logspace(-3, 6, num=20, base=2)

# Candidate models. The linear and rbf kernels get separate grids because
# gamma is not used by the linear kernel: crossing it with kernel='linear'
# refits the same model once per gamma value (380 redundant candidates).
# NOTE(review): RandomForestClassifier has no random_state, so the selected
# max_depth can differ between runs.
params_grid = [
    {
        'estimator': [SVC(max_iter=10000)],
        'estimator__kernel': ['linear'],
        'estimator__C': svm_scale_grid,
    },
    {
        'estimator': [SVC(max_iter=10000)],
        'estimator__kernel': ['rbf'],
        'estimator__C': svm_scale_grid,
        'estimator__gamma': svm_scale_grid,
    },
    {
        'estimator': [RandomForestClassifier()],
        'estimator__max_depth': list(range(1, 30)),
    },
]

In [9]:
# Exhaustive search over params_grid, fitted on the train and validation
# rows stacked together; X_test is not touched here.
gender_clf = GridSearchCV(estimator=pipe, param_grid=params_grid)
gender_clf.fit(
    X=np.concatenate([X_train, X_val]),
    y=np.concatenate([gender_train, gender_val]),
)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('estimator',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jo...
                                                               min_impurity_split=None,
                                                       

In [10]:
# Show the estimator and hyperparameters selected by the gender grid search.
gender_clf.best_params_

{'estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=24, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'estimator__max_depth': 24}

In [11]:
# Per-class precision/recall/F1 of the refitted gender model on the test rows.
print(
    "Classification report for gender classifier:\n"
    + classification_report(y_true=gender_test, y_pred=gender_clf.predict(X_test))
    + "\n"
)

Classification report for gender classifier:
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       182
           1       0.93      0.92      0.93       327

    accuracy                           0.91       509
   macro avg       0.90      0.90      0.90       509
weighted avg       0.91      0.91      0.91       509




### Status Classifier

In [16]:
# Slice the encoded status labels with the same index lists used for X.
status_train = status_vec[train_indices]
status_val = status_vec[val_indices]
status_test = status_vec[test_indices]

# Single-step pipeline; the grid below swaps the 'estimator' step itself,
# so the SVC() given here is only a placeholder.
pipe = Pipeline(steps=[('estimator', SVC())])

# 20 log-spaced values from 2**-3 to 2**6, used for both C and gamma.
svm_scale_grid = np.logspace(-3, 6, num=20, base=2)

# Candidate models. The linear and rbf kernels get separate grids because
# gamma is not used by the linear kernel: crossing it with kernel='linear'
# refits the same model once per gamma value (380 redundant candidates).
# NOTE(review): RandomForestClassifier has no random_state, so the selected
# max_depth can differ between runs.
params_grid = [
    {
        'estimator': [SVC(max_iter=10000)],
        'estimator__kernel': ['linear'],
        'estimator__C': svm_scale_grid,
    },
    {
        'estimator': [SVC(max_iter=10000)],
        'estimator__kernel': ['rbf'],
        'estimator__C': svm_scale_grid,
        'estimator__gamma': svm_scale_grid,
    },
    {
        'estimator': [RandomForestClassifier()],
        'estimator__max_depth': list(range(1, 30)),
    },
]

In [17]:
# Exhaustive search over params_grid, fitted on the train and validation
# rows stacked together; X_test is not touched here.
status_clf = GridSearchCV(estimator=pipe, param_grid=params_grid)
status_clf.fit(
    X=np.concatenate([X_train, X_val]),
    y=np.concatenate([status_train, status_val]),
)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('estimator',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jo...
                                                               min_impurity_split=None,
                                                       

In [21]:
# Show the estimator and hyperparameters selected by the status grid search.
status_clf.best_params_

{'estimator': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=12, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'estimator__max_depth': 12}

In [22]:
# Per-class precision/recall/F1 of the refitted status model on the test rows.
print(
    "Classification report for status classifier:\n"
    + classification_report(y_true=status_test, y_pred=status_clf.predict(X_test))
    + "\n"
)

Classification report for status classifier:
              precision    recall  f1-score   support

           0       0.40      0.30      0.35       195
           1       0.25      0.27      0.26       154
           2       0.36      0.44      0.40       160

    accuracy                           0.34       509
   macro avg       0.34      0.34      0.33       509
weighted avg       0.34      0.34      0.34       509




### Age Predictor

In [23]:
# Slice the age targets with the same index lists used for X.
age_train, age_val, age_test = (
    age_vec[idx] for idx in (train_indices, val_indices, test_indices)
)

# Grid-search an iteration-capped SVR over C (1..50 in steps of 1) and
# epsilon (7 log-spaced values from 2**-10 to 2**-3), fitted on the train
# and validation rows stacked together.
svr_clf = GridSearchCV(
    estimator=SVR(max_iter=10000),
    param_grid=dict(
        C=np.linspace(1, 50, num=50),
        epsilon=np.logspace(-10, -3, num=7, base=2),
    ),
)
svr_clf.fit(
    X=np.concatenate([X_train, X_val]),
    y=np.concatenate([age_train, age_val]),
)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=10000, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.]),
                         'epsilon': array([0.00097656, 0.00219231, 0.00492157, 0.01104854, 0.02480314,
       0.05568117, 0.125     ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
# Show the C / epsilon pair selected by the age grid search.
svr_clf.best_params_

{'C': 38.0, 'epsilon': 0.125}

In [27]:
# Predict the test rows once and reuse the result for all three metrics
# (the original called predict() three times).
age_pred = svr_clf.predict(X_test)

# RMSE is derived from MSE with np.sqrt instead of the `squared=` keyword of
# mean_squared_error, which newer scikit-learn releases no longer accept;
# the printed values are the same.
age_mse = mean_squared_error(age_test, age_pred)

# NOTE(review): the heading says "Classification report" although these are
# regression metrics; the text is kept verbatim so the printed output matches.
print(f"Classification report for age predictor:\n"
      f"MSE = {age_mse}\n"
      f"RMSE = {np.sqrt(age_mse)}\n"
      f"R2 Score = {r2_score(age_test, age_pred)}\n")

Classification report for age predictor:
MSE = 63.34132804388615
RMSE = 7.958726534055945
R2 Score = 0.7206949704651264

