In [1]:
import pandas as pd

df_raw = pd.read_csv('gender-classification.csv', encoding='latin1')

In [2]:
# 目的変数をいじる

## 目的変数を確認
obj_col = 'gender'
print('生データ\n{}'.format(df_raw[obj_col].value_counts()))

## male と femaleを抽出
df = df_raw.loc[ (df_raw[obj_col]=='male') | (df_raw[obj_col]=='female')]

## indexを振り直さないとlocメソッドでerrorが出る
df = df.reset_index(drop=True)

##ラベルを整数に変換
df.loc[ df[obj_col]=='male' , obj_col] = 0
df.loc[ df[obj_col]=='female' , obj_col] = 1

## 目的変数の型変換
df.loc[:, obj_col ] = df.loc[:, obj_col ].astype(int)

print('修正後のデータ\n{}'.format(df[obj_col].value_counts()))

生データ
female     6700
male       6194
brand      5942
unknown    1117
Name: gender, dtype: int64
修正後のデータ
1    6700
0    6194
Name: gender, dtype: int64


In [3]:
# 説明変数をいじる
## description : 頻度表
## text : 分散表現 (word2vec or fasttext) + 品詞の並び方
## name : 文字単位
## link_color : RGBで各次元を形成
## sidebar_color : RGBで各次元を形成
## fav_number : 不明
## retweet_count : 不明
## profileimage : 学習済みモデルから分類する? コスト高そう そもそも人かどうか判断できる時点で予測するまでもない

## 性別判定の信頼度が1.0を対象に学習する
df = df.loc[ df['gender:confidence'] >=1.0 ]
df = df.reset_index(drop=True)

In [4]:
def convert_hex_to_rgb_vector(hex_color, column='', normalize=1):
    
    if len(hex_color) == 6:
        r = int(hex_color[0:2], 16) / normalize
        g = int(hex_color[2:4], 16) / normalize
        b = int(hex_color[4:6], 16) / normalize
    else:
        r,g,b = 0, 0, 0
        
    if column != '':
        return pd.Series({column+'_r':r, column+'_g':g, column+'_b':b})
    else:
        return pd.Series({'r':r, 'g':g, 'b':b})

In [5]:
column = 'link_color'
df = pd.concat( [df, df[column].apply(convert_hex_to_rgb_vector, column=column, normalize=255)], axis=1 )
column = 'sidebar_color'
df = pd.concat( [df, df[column].apply(convert_hex_to_rgb_vector, column=column, normalize=255)], axis=1 )

In [6]:
#説明変数を以下のカラムとする
exp_col = ['link_color_r', 
                  'link_color_g', 
                  'link_color_b', 
                  'sidebar_color_r', 
                  'sidebar_color_g', 
                  'sidebar_color_b']

In [7]:
x = df[exp_col].values
y = df[obj_col].values

In [8]:
# train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 0)

In [9]:
# ロジスティック回帰モデル
from sklearn.linear_model import LogisticRegression

In [10]:
# 5-fold cross validation
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

n = X_train.shape[1]

#標準化と学習のパイプライン
pipe = Pipeline([('scaler',StandardScaler()),("LR", LogisticRegression(random_state=0))])
pipe.fit(X_train,y_train)

#パラメータは10^nオーダーで変化させる
params = {'LR__C':[0.0001, 0.001, 0.01, 0.1],'LR__penalty':['l1', 'l2'],'LR__class_weight':[None,'balanced']}

#グリッドサーチ
grid = GridSearchCV(pipe, param_grid = params, cv = 5, scoring='accuracy',n_jobs=10)
grid.fit(X_train,y_train)

print('Best cross-validation accuracy: {:.2f}'.format(grid.best_score_))
print('Train set score: {:.2f}'.format(grid.score(X_train,y_train)))
print('Best parameters : {}'.format(grid.best_params_))

Best cross-validation accuracy: 0.57
Train set score: 0.57
Best parameters : {'LR__C': 0.01, 'LR__class_weight': 'balanced', 'LR__penalty': 'l1'}


In [11]:
#最適化したパラメータで学習する
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, penalty='l1', C=0.01, class_weight='balanced')

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [12]:
from sklearn.metrics import classification_report

print('平均予測精度 : {:.2f}\n'.format(clf.score(X_test, y_test)))
print(classification_report(y_test, y_pred, target_names=['male','female']))

平均予測精度 : 0.59

             precision    recall  f1-score   support

       male       0.54      0.72      0.62       461
     female       0.67      0.49      0.56       541

avg / total       0.61      0.59      0.59      1002

