In [13]:
!python -m wn download omw-he:1.4


[K
Download (0 bytes) Requesting
[K
Download [                              ] (0/315276 bytes) Receiving
[K
Download [=                             ] (8192/315276 bytes) Receiving
[K
Download [#-                            ] (16384/315276 bytes) Receiving
[K
Download [##-                           ] (24576/315276 bytes) Receiving
[K
Download [###                           ] (32768/315276 bytes) Receiving
[K
Download [###=                          ] (40960/315276 bytes) Receiving
[K
Download [####=                         ] (49152/315276 bytes) Receiving
[K
Download [#####-                        ] (57344/315276 bytes) Receiving
[K
Download [######                        ] (65536/315276 bytes) Receiving
[K
Download [#######                       ] (73728/315276 bytes) Receiving
[K
Download [#######=                      ] (81920/315276 bytes) Receiving
[K
Download [########-                     ] (90112/315276 bytes) Receiving
[K
Download [#########-                    ] 

In [14]:
# --------------------------------------
import pandas as pd
import numpy as np
# --------------------------------------


# --------------------------------------
# ------------- visualizations:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# --------------------------------------


# ---------------------------------------
import sklearn
from sklearn import preprocessing, metrics, pipeline, model_selection, feature_extraction 
from sklearn import naive_bayes, linear_model, svm, neural_network, neighbors, tree
from sklearn import decomposition, cluster

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, r2_score, silhouette_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# ---------------------------------------


# ----------------- output and visualizations: 
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter("ignore", category=ConvergenceWarning)
# show several prints in one cell. This will allow us to condence every trick in one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# ---------------------------------------

In [15]:
# --------------------------------------
# --------- Text analysis and Hebrew text analysis imports:
# vectorizers:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# regular expressions:
import re
# --------------------------------------

In [16]:
from sklearn.linear_model import LogisticRegression
import wn

In [17]:
vectorizers_lst = [(CountVectorizer, {'min_df' : 5, 'max_df' : 1.0, 'ngram_range' : (1, 3)}),
                   (TfidfVectorizer, {'min_df' : 5, 'max_df' : 1.0, 'ngram_range' : (1, 3)}),
                   (CountVectorizer, {'min_df' : 0.01, 'max_df' : 0.99, 'ngram_range' : (1, 2), 'max_features' : 30000}),
                   (TfidfVectorizer, {'min_df' : 0.01, 'max_df' : 0.99, 'ngram_range' : (1, 2), 'max_features' : 30000})]
models = [KNeighborsClassifier(), GaussianNB(), LogisticRegression()]
models_param_lst = [{'n_neighbors' : [3, 5, 7, 9, 11]},
                  {},
                  {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1.0, 10.0, 100.0], 'solver': ['lbfgs', 'newton-cg', 'liblinear'], 'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 3}, {0: 1, 1: 5}], 'random_state': [41, 42]}]

In [18]:
def preprocessing(df_train):
    #TODO add lemmatiztion and stemming
    
    stories_lst = list(df_train.story)
    
    for i in range(len(stories_lst)):
        story = ""
        #print(f"DEBUG preprocessing ==> before cleaning story:\n{stories_lst[i]}")
        
        for letter in stories_lst[i]:
            if letter == ' ' or (letter >= 'א' and letter <= 'ת'):
                story += letter
        
        #print(f"DEBUG preprocessing ==> after cleaning story:\n{story}")
        #print("----------------------------------------------------------------------")
        stories_lst[i] = story
    
    df_train.story = stories_lst
    return df_train

In [19]:
def vectorization(vectorizer, data, res_col):
    
    cols = vectorizer.get_feature_names_out()
    
    X_train = pd.DataFrame(data.toarray(), columns=cols)

    y_male = np.where(res_col == 'm', 1, 0)
    y_female = np.where(res_col == 'f', 1, 0)
    
    
    #print(f"DEBUG vectorization ==> X_train:\n{X_train}")
    #print("----------------------------------------------------------------------")
    return X_train, y_male, y_female

In [20]:
def get_f1_score(model_name, best_parameters, X_train, y_train):
    
    model = model_name
    model.set_params(**best_parameters)
    model.fit(X_train, y_train)
    
    score = cross_val_score(model, X=X_train, y=y_train, scoring='f1', cv=10).mean()
    
    #print(f"DEBUG get_f1_score ==> model_name: {model}")
    #print(f"DEBUG get_f1_score ==> best_params: {best_parameters}")
    #print(f"DEBUG get_f1_score ==> score: {score}")
    
    return score, model

In [27]:
def predict_df(vectorizer, model, model_params, df_test, gender):
    df = preprocessing(df_test)
    
    vec = vectorizer.transform(df_test["story"])
    
    X_test = pd.DataFrame(vec.toarray(), columns=vectorizer.get_feature_names_out())
    
    y_pred = model.predict(X_test)
    
    if gender == 'm':
        prediction_list = np.where(y_pred == 1, 'm', 'f')
    
    else:
        prediction_list = np.where(y_pred == 0, 'm', 'f')
    
    res_df = df_test.copy()
    res_df["gender"] = prediction_list
    
    return res_df

In [28]:
train_filename = 'annotated_corpus_for_train.csv'
test_filename  = 'corpus_for_test.csv'
df_train = pd.read_csv(train_filename, index_col=None, encoding='utf-8')
df_test  = pd.read_csv(test_filename, index_col=None, encoding='utf-8')

In [29]:
all_models_combinations = []

for vectorizer_cls, v_params in vectorizers_lst:
    print(f"DEBUG main ==> start {str(vectorizer_cls)} vectorizer")
    
    for model_i in range(len(models)):
        print(f"DEBUG main ==> model: {models[model_i]}")
        
        df_train = preprocessing(df_train)
        
        vectorizer = vectorizer_cls(**v_params)
        vectorized_data = vectorizer.fit_transform(df_train.story)
        
        X, y_male, y_female = vectorization(vectorizer, vectorized_data, df_train.gender)
        
        grid_search_model = GridSearchCV(models[model_i], models_param_lst[model_i], cv=10, scoring='f1')
        grid_search_res_male = grid_search_model.fit(X.copy(), y_male)
        
        grid_search_model = GridSearchCV(models[model_i], models_param_lst[model_i], cv=10, scoring='f1')
        grid_search_res_female = grid_search_model.fit(X.copy(), y_female)
        
        f1_male_score, male_trained_model = get_f1_score(models[model_i], grid_search_res_male.best_params_, X, y_male)
        f1_female_score, female_trained_model = get_f1_score(models[model_i], grid_search_res_female.best_params_, X, y_female)
        
        f1_avg_score = (f1_male_score + f1_female_score)/2
        
        if f1_avg_score > 0.60:
            print("---------------------------------------Good model found------------------------------------------")
            print(f"DEBUG main ==> model_chose: {models[model_i]}")
            print(f"DEBUG main ==> best_male_parameters: {grid_search_res_male.best_params_}")
            print(f"DEBUG main ==> best_female_parameters: {grid_search_res_female.best_params_}")
            print(f"DEBUG main ==> male_score: {f1_male_score}")
            print(f"DEBUG main ==> female_score: {f1_female_score}")
            print(f"DEBUG main ==> avg_score: {f1_avg_score}")
            
            df_male = predict_df(vectorizer, male_trained_model, grid_search_res_male.best_params_, df_test, 'm')
            df_female = predict_df(vectorizer, female_trained_model, grid_search_res_male.best_params_, df_test, 'f')
            
        all_models_combinations.append({"f1_avg":f1_avg_score, "female_pred":df_female.gender, "man_pred":df_male.gender, "trained_male":male_trained_model,
                               "trained_female":female_trained_model, "best_male_params":grid_search_res_male.best_params_,
                               "best_female_params":grid_search_res_female.best_params_, "vectorizer":vectorizer_cls, "vectorizer_params": v_params})
        
        print("::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::")

DEBUG main ==> start <class 'sklearn.feature_extraction.text.CountVectorizer'> vectorizer
DEBUG main ==> model: KNeighborsClassifier(n_neighbors=3)
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
DEBUG main ==> model: GaussianNB()
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
DEBUG main ==> model: LogisticRegression(C=100.0, class_weight={0: 1, 1: 1}, random_state=41)
---------------------------------------Good model found------------------------------------------
DEBUG main ==> model_chose: LogisticRegression(C=0.01, class_weight={0: 1, 1: 5}, random_state=41)
DEBUG main ==> best_male_parameters: {'C': 100.0, 'class_weight': {0: 1, 1: 5}, 'penalty': 'l2', 'random_state': 41, 'solver': 'newton-cg'}
DEBUG main ==> best_female_parameters: {'C': 0.01, 'class_weight': {0: 1, 1: 5}, 'penalty': 'l2', 'random_state': 41, 'solver': 'lbfgs'}
DEBUG main ==> male_score: 0.8816719975399602
DEBUG main ==> female_score: 0.5638827893741348
DEBUG

In [34]:
max_score = 0

for model in all_models_combinations:
    if model["f1_avg"] > max_score:
        max_score = model["f1_avg"]

print(max_score)

0.7342755186698735
