In [1]:
import numpy as np
import pandas as pd
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("data/train.csv") as file:
    train = pd.read_csv(file)
with open("data/valid.csv") as valid_file:
    valid = pd.read_csv(valid_file)
with open("data/test.csv") as test_file:
    test = pd.read_csv(test_file)
with open("data/user_vec.csv") as file:
    user_vec = pd.read_csv(file)
with open("data/aver_review.csv") as file:
    words_count = pd.read_csv(file)
with open("data/aver_figure.csv") as file:
    figuration = pd.read_csv(file)

In [3]:
del train['Unnamed: 0']
del valid['Unnamed: 0']
del test['Unnamed: 0']
del user_vec['Unnamed: 0']
del words_count['Unnamed: 0']
del figuration['Unnamed: 0']

In [4]:
X_train_df = pd.merge(user_vec,train,on="user_id",how="inner")
X_train_df = pd.merge(words_count,X_train_df,on="user_id",how="inner")
X_train_df = pd.merge(figuration,X_train_df,on="user_id",how="inner")
X_train_df.dropna(inplace=True)
len(X_train_df)

7200

In [5]:
X_valid_df = pd.merge(user_vec,valid,on="user_id",how="inner")
X_valid_df = pd.merge(words_count,X_valid_df,on="user_id",how="inner")
X_valid_df = pd.merge(figuration,X_valid_df,on="user_id",how="inner")
X_valid_df.dropna(inplace=True)
len(X_valid_df)

2399

In [6]:
X_test_df = pd.merge(user_vec,test,on="user_id",how="inner")
X_test_df = pd.merge(words_count,X_test_df,on="user_id",how="inner")
X_test_df = pd.merge(figuration,X_test_df,on="user_id",how="inner")
X_test_df.dropna(inplace=True)
len(X_test_df)

2400

In [7]:
X_train = X_train_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_train = X_train_df['gender'].as_matrix().astype(np.float)
X_valid = X_valid_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_valid = X_valid_df['gender'].as_matrix().astype(np.float)
X_test = X_test_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_test = X_test_df['gender'].as_matrix().astype(np.float)

In [8]:
X_train_valid = np.concatenate([X_train,X_valid],axis=0)
y_train_valid = np.concatenate([y_train,y_valid],axis=0)

In [9]:
lr = LogisticRegression()

In [10]:
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    
    y_pred1 = clf.predict(X_test)
    
    print("Scores on test dataset")
    print("Accuracy: " + str(metrics.accuracy_score(y_test,y_pred1))) 
    print("Precision: " + str(metrics.precision_score(y_test, y_pred1)))
    print("Recall: " + str(metrics.recall_score(y_test, y_pred1)))
    print("F1: " + str(metrics.f1_score(y_test,y_pred1)))
    
    y_pred2 = clf.predict(X_train)
    
    print("Scores on train dataset")
    print("Accuracy: " + str(metrics.accuracy_score(y_train,y_pred2)))
    print("Precision: " + str(metrics.precision_score(y_train, y_pred2)))
    print("Recall: " + str(metrics.recall_score(y_train, y_pred2)))
    print("F1: " + str(metrics.f1_score(y_train,y_pred2)))
    
    return y_pred1

In [11]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [12]:
y_lr = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8279166666666666
Precision: 0.8451612903225807
Recall: 0.9440133037694013
F1: 0.8918565069389893
Scores on train dataset
Accuracy: 0.8319616626731952
Precision: 0.8451888094341989
Recall: 0.9467397414277684
F1: 0.8930867634387221


In [13]:
y_lrs = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8279166666666666
Precision: 0.8451612903225807
Recall: 0.9440133037694013
F1: 0.8918565069389893
Scores on train dataset
Accuracy: 0.8319616626731952
Precision: 0.8451888094341989
Recall: 0.9467397414277684
F1: 0.8930867634387221


In [14]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [15]:
y_svm = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8320833333333333
Precision: 0.8500749625187406
Recall: 0.9429046563192904
F1: 0.8940867279894875
Scores on train dataset
Accuracy: 0.8380039587457027
Precision: 0.8510289104910996
Recall: 0.9473018549747049
F1: 0.8965884152424021


In [16]:
y_svms = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8320833333333333
Precision: 0.8500749625187406
Recall: 0.9429046563192904
F1: 0.8940867279894875
Scores on train dataset
Accuracy: 0.8380039587457027
Precision: 0.8510289104910996
Recall: 0.9473018549747049
F1: 0.8965884152424021


In [17]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [18]:
y_rf = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.7666666666666667
Precision: 0.8315565031982942
Recall: 0.8647450110864745
F1: 0.8478260869565217
Scores on train dataset
Accuracy: 0.994270236482967
Precision: 0.9960657580441197
Recall: 0.9962057335581788
F1: 0.9961357408838614


In [19]:
y_rfs = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.7920833333333334
Precision: 0.8196962273395394
Recall: 0.9273835920177383
F1: 0.8702210663198958
Scores on train dataset
Accuracy: 0.9994791124075425
Precision: 0.9994381233319286
Recall: 0.9998594716132658
F1: 0.9996487530734106


In [20]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [21]:
y_all = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8308333333333333
Precision: 0.846382556987116
Recall: 0.9467849223946785
F1: 0.8937728937728938
Scores on train dataset
Accuracy: 0.8410251067819564
Precision: 0.8515723270440252
Recall: 0.9513771781899943
F1: 0.8987123324040887


In [22]:
standard = StandardScaler()
standard.fit_transform(X_train)
standard.fit_transform(X_test)
standard.fit_transform(X_valid)

array([[ 1.22995388,  0.49086616,  0.67475654, ..., -0.07243913,
        -0.31102013, -0.68900824],
       [-0.25828434, -0.58213269, -0.11499057, ...,  1.75319954,
        -0.13494346,  0.86757924],
       [-0.15293   , -0.35438168, -0.49752432, ..., -1.20739211,
        -0.99716936, -0.63793663],
       ...,
       [-0.4754198 , -0.87808691,  1.92107619, ...,  0.40951746,
        -0.2000165 ,  0.3843142 ],
       [ 0.22201597,  0.46666271,  0.64390704, ...,  0.51796172,
         0.57638096, -0.23169261],
       [-0.66691896, -0.70442018, -0.54688351, ..., -0.30776511,
        -2.17868485, -0.05897368]])

In [23]:
clf = MLPClassifier(hidden_layer_sizes=[20,20,10],max_iter=1000,random_state=33)
y_nn = train_and_evaluate(clf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8254166666666667
Precision: 0.8654353562005277
Recall: 0.9090909090909091
F1: 0.8867261422005949
Scores on train dataset
Accuracy: 0.8446713199291593
Precision: 0.8749500066657779
Recall: 0.9222878021360315
F1: 0.8979954847095847


In [24]:
result = pd.DataFrame({'user_id':X_test_df['user_id'],'lr_label':y_lr,'lrs_label':y_lrs,'svm_label':y_svm,
                      'svms_label':y_svms,'rf_label':y_rf,'rfs_label':y_rfs,'mix_label':y_all,'nn_label':y_nn,
                      'true_label':y_test})
result.to_csv('result_given_feature.csv')