In [1]:
import numpy as np
import pandas as pd
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("data/train.csv") as train_file:
    train = pd.read_csv(train_file)
with open("data/valid.csv") as valid_file:
    valid = pd.read_csv(valid_file)
with open("data/test.csv") as test_file:
    test = pd.read_csv(test_file)
with open("data/username_feature.csv") as username_file:
    username = pd.read_csv(username_file)
with open("data/review_feature.csv") as review_file:
    review = pd.read_csv(review_file)
with open("data/sentiment.csv") as sentiment_file:
    sentiment = pd.read_csv(sentiment_file)
with open("data/avatar_feature.csv") as avatar_file:
    avatar = pd.read_csv(avatar_file)

In [3]:
del train['Unnamed: 0']
del test['Unnamed: 0']
del valid['Unnamed: 0']
del username['Unnamed: 0']
del review['Unnamed: 0']
del sentiment['Unnamed: 0']
del avatar['Unnamed: 0']

In [4]:
train.head()

Unnamed: 0,user_id,gender
0,3899123,1
1,925660559,1
2,4365275,1
3,7937454,1
4,141082488,1


In [5]:
valid.head()

Unnamed: 0,user_id,gender
0,14132908,1
1,13843840,1
2,6228345,1
3,4639982,1
4,767439249,0


In [6]:
test.head()

Unnamed: 0,user_id,gender
0,22928025,1
1,1110808,1
2,36161565,0
3,3827369,1
4,25928221,0


In [7]:
X_train_df = pd.merge(username,train,on='user_id',how='inner')
X_train_df = pd.merge(review,X_train_df,on='user_id',how='inner')
X_train_df = pd.merge(sentiment,X_train_df,on='user_id',how='inner')
X_train_df = pd.merge(avatar,X_train_df,on='user_id',how='inner')
X_train_df.dropna(inplace=True)
len(X_train_df)

5905

In [8]:
X_valid_df = pd.merge(username,valid,on='user_id',how='inner')
X_valid_df = pd.merge(review,X_valid_df,on='user_id',how='inner')
X_valid_df = pd.merge(sentiment,X_valid_df,on='user_id',how='inner')
X_valid_df = pd.merge(avatar,X_valid_df,on='user_id',how='inner')
X_valid_df.dropna(inplace=True)
len(X_valid_df)

1914

In [9]:
X_test_df = pd.merge(username,test,on='user_id',how='inner')
X_test_df = pd.merge(review,X_test_df,on='user_id',how='inner')
X_test_df = pd.merge(sentiment,X_test_df,on='user_id',how='inner')
X_test_df = pd.merge(avatar,X_test_df,on='user_id',how='inner')
X_test_df.dropna(inplace=True)
len(X_test_df)

1943

In [10]:
X_train = X_train_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_train = X_train_df['gender'].as_matrix().astype(np.float)
X_valid = X_valid_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_valid = X_valid_df['gender'].as_matrix().astype(np.float)
X_test = X_test_df.iloc[:,1:-1].as_matrix().astype(np.float)
y_test = X_test_df['gender'].as_matrix().astype(np.float)

In [11]:
X_train_valid = np.concatenate([X_train,X_valid],axis=0)
y_train_valid = np.concatenate([y_train,y_valid],axis=0)

In [12]:
lr = LogisticRegression()

In [13]:
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    clf.fit(X_train, y_train)
    
    y_pred1 = clf.predict(X_test)
    
    print("Scores on test dataset")
    print("Accuracy: " + str(metrics.accuracy_score(y_test,y_pred1))) 
    print("Precision: " + str(metrics.precision_score(y_test, y_pred1)))
    print("Recall: " + str(metrics.recall_score(y_test, y_pred1)))
    print("F1: " + str(metrics.f1_score(y_test,y_pred1)))
    
    y_pred2 = clf.predict(X_train)
    
    print("Scores on train dataset")
    print("Accuracy: " + str(metrics.accuracy_score(y_train,y_pred2)))
    print("Precision: " + str(metrics.precision_score(y_train, y_pred2)))
    print("Recall: " + str(metrics.recall_score(y_train, y_pred2)))
    print("F1: " + str(metrics.f1_score(y_train,y_pred2)))
    
    return y_pred1

In [14]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [15]:
y_lr = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8219248584662893
Precision: 0.8605072463768116
Recall: 0.9253246753246753
F1: 0.8917396745932414
Scores on train dataset
Accuracy: 0.9053587415270495
Precision: 0.9071330380326867
Recall: 0.9782572887497941
F1: 0.9413536218101125


In [16]:
y_lrs = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8219248584662893
Precision: 0.8605072463768116
Recall: 0.9253246753246753
F1: 0.8917396745932414
Scores on train dataset
Accuracy: 0.9053587415270495
Precision: 0.9071330380326867
Recall: 0.9782572887497941
F1: 0.9413536218101125


In [17]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [18]:
y_svm = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8095728255275347
Precision: 0.8620049504950495
Recall: 0.9045454545454545
F1: 0.8827629911280102
Scores on train dataset
Accuracy: 0.928251694590101
Precision: 0.931412464766677
Recall: 0.9797397463350355
F1: 0.9549650798747692


In [19]:
y_svms = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8095728255275347
Precision: 0.8620049504950495
Recall: 0.9045454545454545
F1: 0.8827629911280102
Scores on train dataset
Accuracy: 0.928251694590101
Precision: 0.931412464766677
Recall: 0.9797397463350355
F1: 0.9549650798747692


In [20]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
clf4 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [21]:
y_rf = train_and_evaluate(clf1,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.7848687596500258
Precision: 0.8311688311688312
Recall: 0.9142857142857143
F1: 0.8707482993197279
Scores on train dataset
Accuracy: 0.9934774267809183
Precision: 0.9924738219895288
Recall: 0.9991764124526438
F1: 0.9958138389559222


In [22]:
y_rfs = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8095728255275347
Precision: 0.8286516853932584
Recall: 0.9577922077922078
F1: 0.8885542168674697
Scores on train dataset
Accuracy: 0.999744212814938
Precision: 0.9996706734727482
Recall: 1.0
F1: 0.9998353096179182


In [23]:
clf1 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("lg", LogisticRegression())
])
clf2 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("svm", SVC(kernel='linear'))
])
clf3 = Pipeline([
    ("min_max scalar", MinMaxScaler(feature_range=(0.0,1.0))),
    ("rf", RandomForestClassifier())
])
sclf = StackingCVClassifier(classifiers=[clf1,clf2,clf3,clf4],meta_classifier=lr)

In [24]:
y_all = train_and_evaluate(sclf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8286155429747812
Precision: 0.8354641467481935
Recall: 0.9759740259740259
F1: 0.9002695417789757
Scores on train dataset
Accuracy: 0.9457731167668499
Precision: 0.9347190146266359
Recall: 1.0
F1: 0.9662581569314023


In [25]:
standard = StandardScaler()
standard.fit_transform(X_train)
standard.fit_transform(X_test)
standard.fit_transform(X_valid)

array([[-0.82405994, -0.16533907, -0.51306932, ...,  0.        ,
         0.        ,  0.        ],
       [-0.82405994, -0.16533907, -0.51306932, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.31564625, -0.16533907,  2.09181401, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.2960928 , -0.16533907,  1.92244036, ...,  0.        ,
         0.        ,  0.        ],
       [-0.82405994, -0.16533907, -0.51306932, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18138191, -0.16533907, -0.51306932, ...,  0.        ,
         0.        ,  0.        ]])

In [26]:
clf = MLPClassifier(hidden_layer_sizes=[100,100,10],max_iter=1000,random_state=33,alpha=5)
y_nn = train_and_evaluate(clf,X_train_valid,X_test,y_train_valid,y_test)

Scores on test dataset
Accuracy: 0.8244981986618631
Precision: 0.8288535381239714
Recall: 0.9811688311688311
F1: 0.8986024382991377
Scores on train dataset
Accuracy: 0.819542140938739
Precision: 0.8179585152838428
Recall: 0.9873167517707132
F1: 0.8946936338532726


In [28]:
result = pd.DataFrame({'user_id':X_test_df['user_id'],'lr_label':y_lr,'lrs_label':y_lrs,'svm_label':y_svm,
                      'svms_label':y_svms,'rf_label':y_rf,'rfs_label':y_rfs,'mix_label':y_all,'nn_label':y_nn,
                      'true_label':y_test})
result.to_csv('result_my_feature.csv')