In [9]:
import os
import json

import pandas as pd
import numpy as np
import sklearn.svm

from sklearn.preprocessing import *
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

plt.style.use("ggplot")

### Open Data

In [10]:
with open("data/processed/processed_data.json") as pfile:
    info = json.load(pfile)

df = pd.DataFrame(info["df"])
train_indices = info["train_indices"]
test_indices = info["test_indices"]
val_indices = info["val_indices"]

print(np.intersect1d(train_indices, test_indices))
print(np.intersect1d(train_indices, val_indices))
print(np.intersect1d(test_indices, val_indices))

print(np.intersect1d(train_indices, test_indices))
encoder = {
    "gender": LabelEncoder(),
    "status": LabelEncoder(),
    "age": KBinsDiscretizer(encode="ordinal")
}

gender_vec = encoder["gender"].fit_transform(df["Gender"])
status_vec = encoder["status"].fit_transform(df["Status"])
age_vec = encoder["age"].fit_transform(df["Age"].values.reshape(-1, 1)).squeeze()

X = df["features"].values.tolist()
X = np.asarray([np.array(x) for x in X])

X_train = X[train_indices]
X_val = X[val_indices]
X_test = X[test_indices]

[]


### Gender Classifier

In [3]:
gender_train = gender_vec[train_indices]
gender_val = gender_vec[val_indices]
gender_test = gender_vec[test_indices]

pipe = Pipeline(steps=[('estimator', SVC())])
params_grid = [{
                'estimator':[SVC()],
                'estimator__kernel': ['linear', 'rbf']
                },
                {
                'estimator': [RandomForestClassifier()],
                'estimator__max_depth': list(range(1, 30))
                },
              ]

In [4]:
gender_clf = GridSearchCV(pipe, params_grid)
gender_clf.fit(np.concatenate((X_train, X_val)), 
               np.concatenate((gender_train, gender_val)))

GridSearchCV(estimator=Pipeline(steps=[('estimator', SVC())]),
             param_grid=[{'estimator': [SVC()],
                          'estimator__kernel': ['linear', 'rbf']},
                         {'estimator': [RandomForestClassifier(max_depth=20)],
                          'estimator__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15, 16,
                                                   17, 18, 19, 20, 21, 22, 23,
                                                   24, 25, 26, 27, 28, 29]}])

In [5]:
gender_clf.best_params_

{'estimator': RandomForestClassifier(max_depth=20), 'estimator__max_depth': 20}

In [6]:
print(f"Classification report for gender classifier:\n"
      f"{classification_report(gender_test, gender_clf.predict(X_test))}\n")

Classification report for gender classifier:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       814
           1       0.99      1.00      1.00      1476

    accuracy                           0.99      2290
   macro avg       0.99      0.99      0.99      2290
weighted avg       0.99      0.99      0.99      2290




### Status Classifier

In [7]:
status_train = status_vec[train_indices]
status_val = status_vec[val_indices]
status_test = status_vec[test_indices]

pipe = Pipeline(steps=[('estimator', SVC())])
params_grid = [{
                'estimator':[SVC()],
                'estimator__kernel': ['linear', 'rbf']
                },
                {
                'estimator': [RandomForestClassifier()],
                'estimator__max_depth': list(range(1, 30))
                },
              ]

In [None]:
status_clf = GridSearchCV(pipe, params_grid)
status_clf.fit(np.concatenate((X_train, X_val)), 
               np.concatenate((status_train, status_val)))

In [14]:
status_clf.best_params_

NameError: name 'status_clf' is not defined

In [13]:
from sklearn.metrics import classification_report

print(f"Classification report for status classifier:\n"
      f"{classification_report(status_test, status_clf.predict(X_test))}\n")

NameError: name 'status_clf' is not defined

### Age Predictor

In [43]:
age_train = age_vec[train_indices]
age_val = age_vec[val_indices]
age_test = age_vec[test_indices]

svr_clf = GridSearchCV(SVR(), {'C':list(range(10, 40, 5)), 'epsilon': [0.01, 0.05, 0.1]})
svr_clf.fit(np.concatenate((X_train, X_val)), 
            np.concatenate((age_train, age_val)))

GridSearchCV(estimator=SVR(),
             param_grid={'C': [10, 15, 20, 25, 30, 35],
                         'epsilon': [0.01, 0.05, 0.1]})

In [40]:
svr_clf.best_params_

{'C': 30, 'epsilon': 0.2}

In [38]:
print(f"Classification report for age predictor:\n"
      f"MSE = {mean_squared_error(age_test, svr_clf.predict(X_test), squared=True)}\n"
      f"RMSE = {mean_squared_error(age_test, svr_clf.predict(X_test), squared=False)}\n"
      f"R2 Score = {r2_score(age_test, svr_clf.predict(X_test))}\n")

Classification report for age predictor:
MSE = 0.1494349311754228
RMSE = 0.386568145577753
R2 Score = 0.9254115682318192

