In [1]:
import numpy as np
import pandas as pd
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("data/train.csv") as file:
    train = pd.read_csv(file)
with open("data/valid.csv") as valid_file:
    valid = pd.read_csv(valid_file)
with open("data/test.csv") as test_file:
    test = pd.read_csv(test_file)
with open("data/aver_review.csv") as file:
    words_count = pd.read_csv(file)
with open("data/aver_figure.csv") as file:
    figuration = pd.read_csv(file)
with open("data/username_feature.csv") as username_file:
    username = pd.read_csv(username_file)
with open("data/review_feature.csv") as review_file:
    review = pd.read_csv(review_file)
with open("data/sentiment.csv") as sentiment_file:
    sentiment = pd.read_csv(sentiment_file)
with open("data/avatar_feature.csv") as avatar_file:
    avatar = pd.read_csv(avatar_file)

In [3]:
del train['Unnamed: 0']
del valid['Unnamed: 0']
del test['Unnamed: 0']
del words_count['Unnamed: 0']
del figuration['Unnamed: 0']
del username['Unnamed: 0']
del review['Unnamed: 0']
del sentiment['Unnamed: 0']
del avatar['Unnamed: 0']

In [4]:
X_train_df = pd.merge(words_count,train,on="user_id",how="inner")
X_train_df = pd.merge(figuration,X_train_df,on="user_id",how="inner")
X_train_df = pd.merge(username,X_train_df,on='user_id',how='inner')
X_train_df = pd.merge(review,X_train_df,on='user_id',how='inner')
X_train_df = pd.merge(sentiment,X_train_df,on='user_id',how='inner')
X_train_df = pd.merge(avatar,X_train_df,on='user_id',how='inner')
X_train_df.dropna(inplace=True)
len(X_train_df)

5905

In [5]:
X_valid_df = pd.merge(words_count,valid,on="user_id",how="inner")
X_valid_df = pd.merge(figuration,X_valid_df,on="user_id",how="inner")
X_valid_df = pd.merge(username,X_valid_df,on='user_id',how='inner')
X_valid_df = pd.merge(review,X_valid_df,on='user_id',how='inner')
X_valid_df = pd.merge(sentiment,X_valid_df,on='user_id',how='inner')
X_valid_df = pd.merge(avatar,X_valid_df,on='user_id',how='inner')
X_valid_df.dropna(inplace=True)
len(X_valid_df)

1914

In [6]:
X_test_df = pd.merge(words_count,test,on="user_id",how="inner")
X_test_df = pd.merge(figuration,X_test_df,on="user_id",how="inner")
X_test_df = pd.merge(username,X_test_df,on='user_id',how='inner')
X_test_df = pd.merge(review,X_test_df,on='user_id',how='inner')
X_test_df = pd.merge(sentiment,X_test_df,on='user_id',how='inner')
X_test_df = pd.merge(avatar,X_test_df,on='user_id',how='inner')
X_test_df.dropna(inplace=True)
len(X_test_df)

1943

In [7]:
X_train = X_train_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_train = X_train_df['gender'].as_matrix().astype(np.float)
X_valid = X_valid_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_valid = X_valid_df['gender'].as_matrix().astype(np.float)
X_test = X_test_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_test = X_test_df['gender'].as_matrix().astype(np.float)

In [8]:
X_train_valid = np.concatenate([X_train,X_valid],axis=0)
y_train_valid = np.concatenate([y_train,y_valid],axis=0)

In [9]:
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    
    y_pred1 = clf.predict(X_test)
    
    print("Scores on test dataset")
    print("Accuracy: " + str(metrics.accuracy_score(y_test,y_pred1))) 
    print("Precision: " + str(metrics.precision_score(y_test, y_pred1)))
    print("Recall: " + str(metrics.recall_score(y_test, y_pred1)))
    print("F1: " + str(metrics.f1_score(y_test,y_pred1)))
    
    y_pred2 = clf.predict(X_train)
    
    print("Scores on train dataset")
    print("Accuracy: " + str(metrics.accuracy_score(y_train,y_pred2)))
    print("Precision: " + str(metrics.precision_score(y_train, y_pred2)))
    print("Recall: " + str(metrics.recall_score(y_train, y_pred2)))
    print("F1: " + str(metrics.f1_score(y_train,y_pred2)))
    
    return y_pred1

In [10]:
lr = LogisticRegression()

In [11]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [12]:
y_lr = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8172928461142563
Precision: 0.8580060422960725
Recall: 0.922077922077922
F1: 0.888888888888889
Scores on train dataset
Accuracy: 0.9045913799718635
Precision: 0.906054598139393
Recall: 0.9785867237687366
F1: 0.9409249287298067


In [13]:
y_lrs = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8172928461142563
Precision: 0.8580060422960725
Recall: 0.922077922077922
F1: 0.888888888888889
Scores on train dataset
Accuracy: 0.9045913799718635
Precision: 0.906054598139393
Recall: 0.9785867237687366
F1: 0.9409249287298067


In [14]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [15]:
y_svm = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8064848172928462
Precision: 0.8619402985074627
Recall: 0.9
F1: 0.8805590851334181
Scores on train dataset
Accuracy: 0.928251694590101
Precision: 0.932224662692187
Recall: 0.9787514412782079
F1: 0.9549216552832464


In [16]:
y_svms = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8064848172928462
Precision: 0.8619402985074627
Recall: 0.9
F1: 0.8805590851334181
Scores on train dataset
Accuracy: 0.928251694590101
Precision: 0.932224662692187
Recall: 0.9787514412782079
F1: 0.9549216552832464


In [17]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [18]:
y_rf = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.7925887802367473
Precision: 0.8338226658837345
Recall: 0.922077922077922
F1: 0.875732346592661
Scores on train dataset
Accuracy: 0.9929658524107943
Precision: 0.9923076923076923
Recall: 0.99868225992423
F1: 0.995484771365241


In [19]:
y_rfs = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8023674729799279
Precision: 0.8239910313901345
Recall: 0.9545454545454546
F1: 0.884476534296029
Scores on train dataset
Accuracy: 0.9989768512597519
Precision: 0.9988481158466348
Recall: 0.9998352824905288
F1: 0.9993414553836023


In [20]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [21]:
y_all = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8172928461142563
Precision: 0.8554289142171566
Recall: 0.925974025974026
F1: 0.8893046460866855
Scores on train dataset
Accuracy: 0.9345184806241207
Precision: 0.9342290267145759
Recall: 0.9850107066381156
F1: 0.9589480436177036


In [22]:
standard = StandardScaler()
standard.fit_transform(X_train)
standard.fit_transform(X_test)
standard.fit_transform(X_valid)

array([[-0.82405994, -0.16533907, -0.51306932, ..., -0.84354992,
        -0.45781153, -0.86915348],
       [-0.82405994, -0.16533907, -0.51306932, ..., -0.84354992,
        -0.45781153, -0.86915348],
       [ 1.31564625, -0.16533907,  2.09181401, ...,  0.1059498 ,
        -0.19982644,  0.40349684],
       ...,
       [ 1.2960928 , -0.16533907,  1.92244036, ..., -0.38993019,
        -0.37960935,  0.10022698],
       [-0.82405994, -0.16533907, -0.51306932, ...,  0.39230255,
         0.73782388,  0.43599004],
       [ 1.18138191, -0.16533907, -0.51306932, ...,  0.92151924,
         2.75601791, -0.95038648]])

In [23]:
clf = MLPClassifier(hidden_layer_sizes=[100,10,10],max_iter=1000,random_state=33,alpha=1)
y_nn = train_and_evaluate(clf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8353062274832733
Precision: 0.8530092592592593
Recall: 0.9571428571428572
F1: 0.9020807833537333
Scores on train dataset
Accuracy: 0.86507225987978
Precision: 0.8646408839779005
Recall: 0.9795750288255641
F1: 0.9185265271449533


In [24]:
result = pd.DataFrame({'user_id':X_test_df['user_id'],'lr_label':y_lr,'lrs_label':y_lrs,'svm_label':y_svm,
                      'svms_label':y_svms,'rf_label':y_rf,'rfs_label':y_rfs,'mix_label':y_all,'nn_label':y_nn,
                      'true_label':y_test})
result.to_csv('result_all_feature.csv')