In [None]:
import pandas as pd
import numpy as np
import os.path as op
import unicodedata
import sys
from gensim.models import FastText
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict, StratifiedShuffleSplit, train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE, RandomOverSampler
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, RandomOverSampler


# Make the project's helper modules importable: ``basename`` is the folder one
# level up, and its ``utils`` sub-folder is put first on sys.path.
basename = r".." #path to basename
path_utils = op.join(basename , "utils")
sys.path.insert(0, path_utils)

# Project-local helpers. load_library() is called with the ``readWrite`` folder
# before ``readWrite`` is imported -- presumably it makes that folder
# importable (TODO confirm in sys_utils).
from sys_utils import load_library
load_library(op.join(basename, 'readWrite'))
from readWrite import readFile
from sys_utils import load_library
# NOTE(review): names used later without a visible definition (e.g.
# tweet_vectorizer) presumably come from this star import -- verify.
from tweet_utils import *

# Shared pre-processor instance; preprocess_tweet() below calls its
# replace_hashtags_URL_USER() and tokenize() methods.
from preprocess import Preprocess
prep = Preprocess()

# PLACEHOLDER: the right-hand side must be filled in before this cell can run.
# Later cells index the object by word (model_we[word]) and read
# model_we.vector_size.
model_we = #load word embedding model

# PLACEHOLDER: must be filled in with the labelled training DataFrame. Later
# cells read its "text", "user_description", "user_name" and "Sexe" columns.
trainingData = #import labeled data for training


# Quick sanity check of the loaded training data: column types, size, first rows.
print(trainingData.dtypes)
print(trainingData.shape)
trainingData.head(5)

In [None]:
# Load the name list: a header-less, comma-separated file whose three fields are
# read as name, gender and number of occurrences.
gender_names = pd.read_csv(
    r"\data\BabyNames_2018_US_SSA.txt",
    sep=",",
    header=None,
    names=["name", "gender", "OccurencesName"],
)
print(gender_names.shape)

# Lower-case every name so it can be matched against lower-cased first names.
gender_names["name"] = gender_names["name"].map(lambda raw_name: raw_name.lower())


def choose_most_occuring_name(df):
    """Reduce the rows of one name group to the single most frequent entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows sharing the same name; must contain an "OccurencesName" column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when it has at most one row, otherwise a one-row frame
        holding the entry with the largest "OccurencesName". When several rows
        share that maximum, the first of them (in the group's row order) wins,
        so a name never stays duplicated.
    """
    if df.shape[0] <= 1:
        return df

    top_count = df["OccurencesName"].max()
    most_frequent = df[df["OccurencesName"] == top_count]
    # A tie would otherwise keep several rows for the same name and later
    # duplicate every tweet merged on that name.
    return most_frequent.iloc[:1]

# A name can be listed for both genders. Reduce every name group with
# choose_most_occuring_name and renumber the remaining rows from 0.
gender_names = (
    gender_names
    .groupby("name", as_index=False)
    .apply(choose_most_occuring_name)
    .reset_index(drop=True)
)
print(gender_names.shape)

gender_names.head(5)


In [None]:
# Import the data to classify (the path is left blank in this notebook) and keep
# a seeded random sample of 12000 rows.
df = pd.read_csv(
    r"",
    usecols=["id", "text", "user_description", "user_name", "user_screen_name"],
)
df = df.sample(n=12000, random_state=1)

def get_sex(name):
    """Return the lower-cased, ASCII-folded first word of a user name.

    Despite its name this function does not determine a gender; it only
    produces the key that is later merged against the name list.

    Parameters
    ----------
    name : str or any
        Display name of the user. Non-string values (e.g. NaN for a missing
        name) are accepted.

    Returns
    -------
    str or None
        The first whitespace-separated word with accents stripped and
        non-ASCII characters dropped, in lower case; "" when nothing is left
        after folding; None when ``name`` is not a string.
    """
    if not isinstance(name, str):
        return None

    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round-trip then drops the marks and any other non-ASCII character.
    ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # split() without an argument ignores leading whitespace and splits on any
    # whitespace run, so " Anna Smith" yields "anna" instead of "".
    words = ascii_name.split()
    return words[0].lower() if words else ""

# Derive the merge key from each user's display name.
df["firstName"] = df["user_name"].map(get_sex)
print(df.shape)

# Keep only the rows whose first name appears in the name list (inner join) ...
new = df.merge(gender_names, how='inner', left_on="firstName", right_on="name")
# ... and of those, take only examples of names that occur often.
new = new[new["OccurencesName"] > 500]
new = new[["text", "user_description", "user_name", "gender"]]
print(new.shape)


In [None]:
# Create (or overwrite) the "History_Sex" column with an empty string in every row.
# NOTE(review): an empty string is not a null value for pd.isnull -- confirm
# whether missing entries should be NaN instead.
trainingData["History_Sex"]=""

In [None]:
def preprocess_tweet(tweet):
    """Normalise a raw text with the shared ``prep`` object and tokenise it.

    The text first goes through ``prep.replace_hashtags_URL_USER`` (URLs and
    mentions in "replace" mode); the result is passed to ``prep.tokenize``,
    whose return value is returned unchanged.
    """
    replaced = prep.replace_hashtags_URL_USER(tweet, mode_URL="replace", mode_Mentions="replace")
    return prep.tokenize(replaced)

def labelEncode(sex):
    """Map a gender label to its integer class: "M" -> 0, "F" -> 1, anything else -> 2.

    "U" and every unrecognised value (including missing ones) share class 2.
    """
    if sex == "M":
        return 0
    if sex == "F":
        return 1
    return 2


def create_history_sex_column(row):
    """Resolve the final gender label of one row.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series)
        Must provide the keys "History_Sex" and "Sexe".

    Returns
    -------
    The "History_Sex" value when it is one of "M", "F" or "U"; the "Sexe" value
    when "History_Sex" is missing (null, empty or whitespace-only string);
    None for any other value, after printing a diagnostic line.
    """
    history = row["History_Sex"]

    if history in ("M", "F", "U"):
        return history

    # A blank string counts as "no history label". Without this, a column that
    # was initialised with "" sends every row to the error branch below.
    is_blank = isinstance(history, str) and not history.strip()
    if is_blank or pd.isnull(history):
        return row["Sexe"]

    print("ERROR: Should not occur:  ", row["Sexe"], ";;;", row["History_Sex"])
    return None

# Resolve one label per training row from its "History_Sex" / "Sexe" values.
trainingData['history_sex_total'] = trainingData.apply(create_history_sex_column, axis=1)

# Handles on the original and the resolved label columns.
sex = trainingData["Sexe"]
history_sex = trainingData["history_sex_total"]


In [None]:
# Append tweets from our own tweet database whose author's first name matched a
# gender name.
print("trainingData.shape:", trainingData.shape)
print("Own data.shape:", new.shape)

label = "history_sex_total"
data_pd = trainingData[["text", "user_description", "user_name", label]]

# Rename instead of "add column + del": the result has the same columns, and
# re-running the cell is a no-op rather than a KeyError on the already-deleted
# "gender" column.
new = new.rename(columns={"gender": label})

# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# The shuffle is seeded so that the seeded train/test split further down
# really is reproducible across runs.
data_pd = pd.concat([data_pd, new], ignore_index=True).sample(frac=1.0, random_state=0)
print(data_pd[label].value_counts())
data_pd.head()

In [None]:
# Username, user_description and text to vector representation
def _description_is_missing(userDesc):
    """True when a user description is a float (NaN), a single blank or None."""
    return isinstance(userDesc, float) or userDesc == " " or userDesc is None


def _embed_text(raw_text):
    """Pre-process a text and turn it into a vector with the embedding model."""
    return tweet_vectorizer(preprocess_tweet(raw_text), model_we)


data_pd["text"] = data_pd["text"].map(_embed_text)

# 0/1 indicator: does the row carry a usable user description?
data_pd["temp_userDesc"] = data_pd["user_description"].map(
    lambda userDesc: 0 if _description_is_missing(userDesc) else 1
)

# Missing descriptions become a zero vector of the embedding size.
data_pd["user_description"] = data_pd["user_description"].map(
    lambda userDesc: np.zeros((model_we.vector_size, ))
    if _description_is_missing(userDesc)
    else _embed_text(userDesc)
)

def userName_to_vec(name):
    """Username to vector representation if possible, otherwise 0-vector.

    The name is ASCII-folded (NFKD, non-ASCII characters dropped) and its part
    before the first blank is looked up in ``model_we``. Any failure on the
    way -- a non-string name or a failed lookup -- yields a zero vector of
    length ``model_we.vector_size``.
    """
    try:
        ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        first_token = ascii_name.split(" ")[0].replace(" ", "")
        return model_we[first_token]
    except:
        # Deliberately broad best-effort fallback, kept as in the original.
        return np.zeros((model_we.vector_size, ))

def TEMP_userName_to_vec(name):
    """Return 1 when the first part of ``name`` can be looked up in ``model_we``, else 0.

    Uses the same ASCII folding and first-token extraction as userName_to_vec;
    any exception raised on the way counts as "no embedding".
    """
    try:
        ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        first_token = ascii_name.split(" ")[0].replace(" ", "")
        # The lookup result is not needed; only whether it raises matters.
        model_we[first_token]
    except:
        return 0
    return 1


# First record (0/1) whether a name embedding exists, then replace the name by
# its vector and the label by its integer class.
data_pd['temp_user_name'] = data_pd["user_name"].map(TEMP_userName_to_vec)
data_pd["user_name"] = data_pd["user_name"].map(userName_to_vec)
data_pd[label] = data_pd[label].map(labelEncode)


# remove the tweets that are empty because there is no word embedding
data_pd = data_pd[data_pd["text"].map(lambda text_vec: len(text_vec) > 0)]
print(data_pd.shape)

data_pd.head(3)

In [None]:
# helper functions for machine learning pipeline
class ItemSelect(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts one column of the input as a NumPy array."""

    def __init__(self, key):
        # Name of the column to extract in transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, data):
        """Return ``data[self.key]`` converted to a NumPy array via a nested list."""
        column = data[self.key]
        rows = column.values.tolist()
        return np.asarray(rows)
    
    
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a message plus type and shape of its input."""

    def __init__(self, message=""):
        # Text printed each time transform() is called.
        self.message = message

    def fit(self, x, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Print the message and the type/shape of ``X``, then return ``X`` unchanged."""
        kind, dims = type(X), X.shape
        print(self.message)
        print("type:", kind, "len:", dims)
        return X


In [None]:
# choose training algorithm:
#---------------------------------------------------------------------------
# MLPClassifier is offered below but was never imported anywhere in the notebook.
from sklearn.neural_network import MLPClassifier
# use Pipeline from imblearn package to be able to use SMOTE oversampling
# (this rebinds the name ``Pipeline`` that was imported from sklearn earlier)
from imblearn.pipeline import Pipeline

modelAlgo = "SVC"

# Supported algorithms: name -> (estimator class, constructor arguments).
# Only the selected entry is instantiated.
model_specs = {
    # MultinomialNB has no ``random_state`` parameter; passing one raises TypeError.
    # NOTE(review): MultinomialNB also rejects negative feature values -- verify
    # that the embedding features are non-negative before selecting it.
    "MultinomialNB": (MultinomialNB, {}),
    "SVC": (SVC, {"random_state": 0}),
    "logReg": (LogisticRegression, {"random_state": 0}),
    "RandomForest": (RandomForestClassifier, {"random_state": 0}),
    "XGBoost": (XGBClassifier, {"random_state": 0}),
    "MLP": (MLPClassifier, {"early_stopping": True, "batch_size": 32, "random_state": 0}),
}

# Fail right here on a typo instead of with a NameError on ``model`` further down.
if modelAlgo not in model_specs:
    raise ValueError(
        "Unknown modelAlgo %r; choose one of: %s" % (modelAlgo, ", ".join(sorted(model_specs)))
    )
model_class, model_kwargs = model_specs[modelAlgo]
model = model_class(**model_kwargs)

# Each branch selects one pre-vectorised column; the union concatenates them.
feature_union = FeatureUnion(
    transformer_list=[
        ('tweet', Pipeline([
            ('tweetsSelector', ItemSelect(key='text')),
        ])),
        ('userDesc', Pipeline([
            ('userDescSelector', ItemSelect(key='user_description')),
        ])),
        ('userName', Pipeline([
            ('userNameSelector', ItemSelect(key='user_name')),
        ])),
    ],
)

pipeline = Pipeline([
    ('union', feature_union),
    # ``n_jobs`` is no longer passed: it was deprecated for SMOTE and later
    # removed in imbalanced-learn.
    ('smote', SMOTE(random_state=12, sampling_strategy="auto")),
    ('model', model),
])


# parameter grid for grid search by using fastText embeddings
parameters = {
    # Relative weight of each FeatureUnion branch.
    # GridSearchCV rejects a parameter whose candidate list is empty, so the
    # equal-weight baseline stays active; un-comment further candidates to
    # include them in the search.
    'union__transformer_weights': [
        {"tweet": 1, "userDesc": 1, "userName": 1},
#        {"tweet": 1, "userDesc": 1, "userName": 0.8},
#        {"tweet": 1, "userDesc": 1, "userName": 0.5},
#        {"tweet": 1, "userDesc": 1, "userName": 0.4},
#        {"tweet": 1, "userDesc": 0.9, "userName": 0.5},
#        {"tweet": 1, "userDesc": 0.8, "userName": 1},
#        {"tweet": 1, "userDesc": 0.8, "userName": 0.8},
#        {"tweet": 1, "userDesc": 0.8, "userName": 0.6},
#        {"tweet": 1, "userDesc": 0.8, "userName": 0.5},
#        {"tweet": 1, "userDesc": 0.8, "userName": 0.4},
#        {"tweet": 1, "userDesc": 0.7, "userName": 0.7},
#        {"tweet": 1, "userDesc": 0.7, "userName": 0.5},
#        {"tweet": 1, "userDesc": 0.5, "userName": 0.8},
#        {"tweet": 1, "userDesc": 0.5, "userName": 0.5},
#        {"tweet": 1, "userDesc": 0.3, "userName": 0.3},
#        {"tweet": 1, "userDesc": 0.2, "userName": 0.5},
#        {"tweet": 1, "userDesc": 0, "userName": 1},
#        {"tweet": 0, "userDesc": 0, "userName": 1},
    ],

#    'smote__k_neighbors' : [4],

#    # param for SVC
#    'model__kernel' : ["linear"],#["linear", "poly", "rbf"],
#    'model__C' : [0.1,],
#    'model__tol' : [1e-3],
#    'model__class_weight' : ["balanced", {1:0.5}, {1:1}, {1:1.5}],
#
#    # param for RandomForestClassifier
#    'model__n_estimators' : [50, 100, 150],
#    'model__criterion' : ['gini', 'entropy'],
#    'model__max_features' : ['auto', 'log2'],
#    'model__max_depth' : [ 5, 10, 20, 30]
#
#    # param for XGBoost Best: 0.910828 using {'model__learning_rate': 0.05, 'model__reg_alpha': 0, 'model__max_depth': 3, 'model__reg_lambda': 1.5, 'model__n_estimators': 300}
#    'model__max_depth' : [3],#[3,4],
#    'model__learning_rate' : [0.001, 0.01, 0.1],
#    'model__booster' : ["gbtree", "gblinear", "dart"], # ["gblinear"], #
#    'model__gamma' : [0, 0.01],
#    'model__n_estimators' : [100, 150],
#    'model__reg_alpha' : [0, 0.1],
#    'model__reg_lambda' : [0.5, 1.0]
}


In [None]:
def label_encode(sex):
    """Collapse the integer classes (M = 0, F = 1, U = 2) to a signed binary label.

    Class 0 becomes -1; every other value becomes 1.
    """
    return -1 if sex == 0 else 1

print("data:", data_pd.shape, type(data_pd))

# Features are the three vectorised columns; the target is the encoded label.
X = data_pd.loc[:, ["text", "user_description", "user_name"]]
y = data_pd.loc[:, label]

# Alternative kept for reference: train on the rows with label != 2 only.
#temp = data_pd.loc[data_pd[label] != 2]
#print("Temp:", temp.shape, type(temp))
#X = temp[["text", "user_description", "user_name"]]
#y = temp[label]#.map(label_encode)

# Hold out 30 % of the rows for the final evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(split_name + ":", split_part.shape, type(split_part))


X_train_pd = pd.DataFrame(X_train, columns=["text", "user_description", "user_name"])
X_test_pd = pd.DataFrame(X_test, columns=["text", "user_description", "user_name"])

print(y_train.unique())
print("Start Grid search...")
# 5-fold cross-validated search over ``parameters`` with the estimator's default score.
grid = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=5, n_jobs=-2, verbose=1)
grid.fit(X_train_pd, y_train)
print("\nBest: %f using %s" % (grid.best_score_, grid.best_params_))

# Evaluate the best pipeline found on the held-out rows.
y_pred = grid.best_estimator_.predict(X_test_pd)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Performance overall: ")
print(classification_report(y_test, y_pred))

In [None]:
# Save the best pipeline found by the grid search ('model name' is the target file name).
import joblib
joblib.dump(value=grid.best_estimator_, filename='model name', compress=1)

In [None]:
#Plot confusion matrix
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix of ``y_pred`` against ``y_true`` and draw it as a heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    classes : numpy.ndarray
        Class names; indexed with the labels that occur in the data, so the
        labels must be valid indices into this array.
    normalize : bool
        When True, every row of the matrix is divided by its row sum.
    title : str or None
        Figure title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap
        Colour map of the heat map.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        title = 'Normalized confusion matrix' if normalize else 'Confusion matrix, without normalization'

    cm = confusion_matrix(y_true, y_pred)

    # Restrict the class names to the labels present in the data.
    present_labels = unique_labels(y_true, y_pred)
    print("Classes before:", classes)
    print("unique labels:", present_labels)
    classes = classes[present_labels]
    print("Classes:", classes)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)
    # One tick per matrix column/row, labelled with the class names.
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        xticklabels=classes,
        yticklabels=classes,
        title=title,
        ylabel='True label',
        xlabel='Predicted label',
    )

    # Slanted x tick labels, anchored at their right end.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write every cell value into the plot; cells above half the maximum get
    # white text, the others black.
    cell_format = '.2f' if normalize else 'd'
    half_max = cm.max() / 2.
    for row, col in np.ndindex(*cm.shape):
        cell_value = cm[row, col]
        ax.text(col, row, format(cell_value, cell_format),
                ha="center", va="center",
                color="white" if cell_value > half_max else "black")
    fig.tight_layout()
    return ax



np.set_printoptions(precision=2)

class_names = np.array(["M", "F", "U"])
#class_names = np.array(["M", "F"])

# Draw the raw-count matrix first, then the row-normalised one.
for use_normalisation, figure_title in (
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
):
    plot_confusion_matrix(y_test, y_pred, classes=class_names,
                          normalize=use_normalisation, title=figure_title)

plt.show()

# Append columns for gender and type of diabetes prediction

In [None]:
def preprocess_tweet(tweet):
    """Normalise a raw text with the shared ``prep`` object and tokenise it.

    URLs and mentions are handled by ``prep.replace_hashtags_URL_USER`` in
    "replace" mode; the result of ``prep.tokenize`` on that text is returned.
    """
    replaced = prep.replace_hashtags_URL_USER(tweet, mode_URL="replace", mode_Mentions="replace")
    return prep.tokenize(replaced)

def userName_to_vec(name):
    """Return the ``model_we`` vector of a user's first name, or a zero vector.

    The name is ASCII-folded (NFKD, non-ASCII characters dropped) and its part
    before the first blank is looked up in ``model_we``. Any failure on the
    way yields a zero vector of length ``model_we.vector_size``.
    """
    try:
        ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        first_token = ascii_name.split(" ")[0].replace(" ", "")
        return model_we[first_token]
    except:
        # Deliberately broad best-effort fallback, kept as in the original.
        return np.zeros((model_we.vector_size, ))
# Import the dataset to annotate (the path is left blank in this notebook).
df = pd.read_csv(r"")
#del df["emotion"]
#del df["__index_level_1__"]

# Size, column names and first rows, one after the other.
print(df.shape, df.columns, df.head(), sep="\n")

In [None]:
#Apply the gender classifier to data
# Build the three vector columns the saved pipeline selects by name.
# NOTE(review): the training cell drops rows whose text vector is empty; no
# such filter is applied here -- confirm the classifier copes with them.
temp = pd.DataFrame({
    "text": df["text"].map(
        lambda tweet: tweet_vectorizer(preprocess_tweet(tweet), model_we)
    ),
    # Missing descriptions (float NaN, single blank, None) become a zero vector.
    "user_description": df["user_description"].map(
        lambda userDesc: np.zeros((model_we.vector_size, ))
        if isinstance(userDesc, float) or userDesc == " " or userDesc is None
        else tweet_vectorizer(preprocess_tweet(userDesc), model_we)
    ),
    "user_name": df["user_name"].map(userName_to_vec),
})

# The path is left blank in this notebook; ``joblib`` is imported in the
# "save model" cell. Only load model files from a trusted source: joblib files
# are pickles and can execute code on load.
gender_classifier = joblib.load("")

df["gender"] = gender_classifier.predict(temp)
df.head()

In [None]:
# Export the annotated dataset as CSV without the index column (path left blank here).
df.to_csv(path_or_buf=r"", index=False)