In [1]:
import pandas as pd

df_raw = pd.read_csv('gender-classification.csv', encoding='latin1')

In [2]:
# 目的変数をいじる

## 目的変数を確認
obj_col = 'gender'
print('生データ\n{}'.format(df_raw[obj_col].value_counts()))

## male と femaleを抽出
df = df_raw.loc[ (df_raw[obj_col]=='male') | (df_raw[obj_col]=='female')]

## indexを振り直さないとlocメソッドでerrorが出る
df = df.reset_index(drop=True)

##ラベルを整数に変換
df.loc[ df[obj_col]=='male' , obj_col] = 0
df.loc[ df[obj_col]=='female' , obj_col] = 1

## 目的変数の型変換
df.loc[:, obj_col ] = df.loc[:, obj_col ].astype(int)

print('修正後のデータ\n{}'.format(df[obj_col].value_counts()))

生データ
female     6700
male       6194
brand      5942
unknown    1117
Name: gender, dtype: int64
修正後のデータ
1    6700
0    6194
Name: gender, dtype: int64


In [3]:
# 説明変数をいじる
## description : 頻度表
## text : 分散表現 (word2vec or fasttext) + 品詞の並び方
## name : 文字単位
## link_color : RGBで各次元を形成
## sidebar_color : RGBで各次元を形成
## fav_number : 不明
## retweet_count : 不明
## profileimage : 学習済みモデルから分類する? コスト高そう そもそも人かどうか判断できる時点で予測するまでもない

## 性別判定の信頼度が1.0を対象に学習する
df = df.loc[ df['gender:confidence'] >=1.0 ]
df = df.reset_index(drop=True)
df = df.dropna(subset=['description'])
df = df.reset_index(drop=True)

print('修正後のデータ2\n{}'.format(df[obj_col].value_counts()))

修正後のデータ2
1    4616
0    4150
Name: gender, dtype: int64


In [4]:
import re

def remove_urls(text):
    
    urls = [g[0] for g in re.findall(r'((https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+\$,%#]+))', text)]
    
    for url in sorted(urls,key=lambda x:-len(x)):
    
        text = text.replace(url,'')
        
    return text

def remove_mentions(text):
    
    mentions = [g[0] for g in re.findall(r'@\w+', text)]
    
    for mention in sorted(mentions,key=lambda x:-len(x)):
    
        text = text.replace(mention,'')
        
    return text

def remove_rt(text):
    
    rts = [g[0] for g in re.findall(r'\s?RT\s', text)]
    
    for rt in sorted(rts,key=lambda x:-len(x)):
    
        text = text.replace(rt,'')
        
    return text

In [5]:
df.description = df.description.map(remove_urls)
df.description = df.description.map(remove_mentions)
df.description = df.description.map(remove_rt)

In [6]:
x = df.description
y = df[obj_col].values

In [7]:
# train test split
from sklearn.model_selection import train_test_split

df_x_train, df_x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 0)

In [8]:
# 頻度ベクトルの獲得
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np

mystopwords = stopwords.words('english') + ["i'm","rt"]

cntvec = CountVectorizer(stop_words=mystopwords,token_pattern="(?u)(?=[a-zA-Z])\\b[\w']{2,}\\b")

X_train = cntvec.fit_transform(df_x_train.values.tolist()).toarray().astype(np.float64)
X_test  = cntvec.transform(df_x_test.values.tolist()).toarray().astype(np.float64)

In [9]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin

clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print('平均予測精度 : {:.2f}\n'.format(clf.score(X_test, y_test)))
print(classification_report(y_test, y_pred, target_names=['male','female']))

平均予測精度 : 0.68

             precision    recall  f1-score   support

       male       0.72      0.60      0.65       442
     female       0.65      0.76      0.70       435

avg / total       0.68      0.68      0.68       877

