In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("Data/responses.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()

In [59]:
model_features = df[['Daily events','Prioritising workload','Writing notes','Workaholism','Thinking ahead',
                     'Final judgement','Reliability','Keeping promises','Loss of interest','Friends versus money',
                     'Funniness','Fake','Criminal damage','Decision making','Elections','Self-criticism',
                     'Judgment calls','Hypochondria','Empathy','Eating to survive','Giving','Compassion to animals',
                     'Borrowed stuff','Loneliness','Cheating in school','Health','Changing the past','God','Dreams',
                     'Charity','Number of friends','Waiting','New environment','Mood swings',
                     'Appearence and gestures','Socializing','Achievements','Responding to a serious letter',
                     'Children','Assertiveness','Getting angry','Knowing the right people','Public speaking',
                     'Unpopularity','Life struggles','Happiness in life','Energy levels','Small - big dogs',
                     'Personality','Finding lost valuables','Getting up','Interests or hobbies',
                     "Parents' advice",'Questionnaires or polls']]

In [63]:
model_target = df['Gender']

In [64]:
file_name = 'log_model_gender_personality_MOSHER.sav'

In [67]:
def logit_model (model_features, model_target, file_name):
    X = model_features
    y = model_target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    X_scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train) 
    param_grid = {'C': [1, 5, 10], 'penalty': ["l1", "l2"]}
    grid = GridSearchCV(classifier, param_grid, verbose=3)
    grid.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predict_and_actual = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
    filename = file_name
    joblib.dump(grid, filename)
    return predict_and_actual

In [68]:
logit_model(model_features, model_target, file_name)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] C=1, penalty=l1 .................................................
[CV] ..................... C=1, penalty=l1, score=0.751, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ..................... C=1, penalty=l1, score=0.757, total=   0.0s
[CV] C=1, penalty=l1 .................................................
[CV] ..................... C=1, penalty=l1, score=0.737, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.746, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.751, total=   0.0s
[CV] C=1, penalty=l2 .................................................
[CV] ..................... C=1, penalty=l2, score=0.743, total=   0.0s
[CV] C=5, penalty=l1 .................................................
[CV] ............



[CV] .................... C=10, penalty=l2, score=0.751, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.751, total=   0.0s
[CV] C=10, penalty=l2 ................................................
[CV] .................... C=10, penalty=l2, score=0.743, total=   0.0s


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:    0.4s finished


Unnamed: 0,Prediction,Actual
901,female,female
425,female,female
919,female,female
808,male,male
535,male,male
...,...,...
455,female,female
653,female,female
758,male,male
135,male,male
