## Assignment 4 - NLP exercise - Labeling gender from Hebrew text

In [1]:
import pandas as pd
import numpy as np
# ---------------------------------------
import sklearn
from sklearn import preprocessing, metrics, pipeline, model_selection, feature_extraction 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import Perceptron, SGDClassifier

# ----------------- output and visualizations: 
import warnings
from sklearn.exceptions import ConvergenceWarning
# Display the result of every expression in a cell (not only the last one), so several outputs can be condensed into one cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline


#### Text analysis and String manipulation imports:
###### Stop words are not allowed

In [2]:
# --------- Text analysis and Hebrew text analysis imports:
# vectorizers:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# regular expressions:
import re

## Understanding The Data:
#### Train Dataset - a labeled corpus of Hebrew stories with their target values (gender).
   #### - Every story is between 300 and 500 words.
#### Test Dataset (not labeled) - the task is to predict the gender of each story's writer.

In [3]:
# Load the labelled training corpus from a UTF-8 encoded CSV file
# (relative path, so it is resolved against the current working directory).
train_filename = 'annotated_corpus_for_train.csv'
df_train = pd.read_csv(train_filename, index_col=None, encoding='utf-8')
# Both expressions below are displayed because ast_node_interactivity is set to "all".
df_train.head()
df_train.shape

Unnamed: 0,story,gender
0,"כשחבר הזמין אותי לחול, לא באמת חשבתי שזה יקרה,...",m
1,לפני שהתגייסתי לצבא עשיתי כל מני מיונים ליחידו...,m
2,מאז שהתחילו הלימודים חלומו של כל סטודנט זה הפנ...,f
3,"כשהייתי ילד, מטוסים היה הדבר שהכי ריתק אותי. ב...",m
4,‏הייתי מדריכה בכפר נוער ומתאם הכפר היינו צריכי...,f


(753, 2)

#### Removing punctuation and duplicate stories

In [4]:
import string
#removing punctuations 
def remove_punc(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    The characters removed are exactly those listed in ``string.punctuation``.
    They are dropped outright (not replaced by a space), so the words on
    either side of a punctuation mark are joined if no whitespace separates them.
    """
    # A translation table that maps each punctuation character to "delete".
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)
#
# Strip ASCII punctuation from every story.
df_train['story'] = df_train['story'].apply(remove_punc)
# Drop repeated stories; when a story appears more than once only the last copy is kept.
df_train = df_train.drop_duplicates(subset=['story'], keep='last')
# Replace Latin letters and digits with a space.
# regex=True must be explicit: since pandas 2.0 Series.str.replace defaults to
# regex=False, which would treat '[a-zA-Z0-9]' as a literal string and remove nothing.
df_train['story'] = df_train['story'].str.replace(r'[a-zA-Z0-9]', ' ', regex=True)
# Encode the gender labels numerically: m -> 0, f -> 1.
df_train['label'] = df_train['gender'].map({'m': 0, 'f': 1})

df_train.shape


(749, 3)

In [5]:
# Class balance: number of stories per gender label.
gender_counts = df_train['gender'].value_counts()
print(gender_counts)


m    572
f    177
Name: gender, dtype: int64


#### Splitting the train Dataframe

In [6]:
def split_train(df):
    """Separate a labelled corpus frame into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'story' column and a 'label' column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a copy of ``df['story']`` and ``y`` is a
        copy of ``df['label']``.

    Raises
    ------
    KeyError
        If either column is missing from ``df``.
    """
    # Use the frame that was passed in (previously the global df_train was
    # read here and the argument was silently ignored).
    X = df['story'].copy()
    y = df['label'].copy()
    return X, y

In [7]:
def split_data(X, y, test_size=0.1, random_state=42, shuffle=False, stratify=None):
    """Split features and target into train and test parts.

    Thin wrapper around ``train_test_split``. The defaults reproduce the
    original hard-coded behaviour: an unshuffled split with 10% held out.

    Parameters
    ----------
    X, y : array-like
        Features and target of equal length.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``; it seeds the shuffling, so it only
        matters when ``shuffle=True``.
    shuffle : bool, default False
        Whether to shuffle before splitting.
    stratify : array-like or None, default None
        Forwarded to ``train_test_split``. scikit-learn requires
        ``shuffle=True`` when this is not None.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
        stratify=stratify,
    )
    return X_train, X_test, y_train, y_test

In [8]:
# Extract the feature and target series from the cleaned frame, then carve off
# the held-out test part (split_data is defined with test_size=0.1, shuffle=False).
X,y = split_train(df_train)
X_train, X_test, y_train, y_test = split_data(X, y)


In [9]:
# Report the size of each part of the train/test split.
shape_report = (
    ("X_train shape is:", X_train),
    ("y_train shape is:", y_train),
    ("X_test shape is:", X_test),
    ("y_test shape is:", y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

X_train shape is: (674,)
y_train shape is: (674,)
X_test shape is: (75,)
y_test shape is: (75,)


#### The assignment asks for an F1-macro score higher than 74 (i.e. 0.74):

In [10]:
# Pipeline #1: TF-IDF over word 1- to 3-grams feeding a Perceptron classifier.
# On the held-out split evaluated in the next cell this gives ~0.88 accuracy
# and ~0.80 macro-F1.
perceptron_steps = [
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            max_df=0.8,
            token_pattern=r'\b\w{2,15}\b',
        ),
    ),
    (
        'clf',
        Perceptron(
            random_state=42,
            alpha=0.001,
            penalty='elasticnet',
            tol=1e-7,
            shuffle=True,
            eta0=0.101,
        ),
    ),
]
train_pipe = Pipeline(steps=perceptron_steps)
train_pipe.fit(X_train, y_train);
y_pred = train_pipe.predict(X_test);

In [11]:
# Evaluate the current predictions (y_pred) against the held-out labels.
# The confusion matrix is displayed because ast_node_interactivity is "all".
confusion_matrix(y_test, y_pred)
clf_rep = metrics.classification_report(y_test, y_pred)
print(clf_rep)
# Label corrected: the metric is the macro-averaged F1 (was misspelled "marco").
print('The f1 macro score is:', f1_score(y_test, y_pred, average='macro'))

array([[57,  1],
       [ 8,  9]], dtype=int64)

              precision    recall  f1-score   support

           0       0.88      0.98      0.93        58
           1       0.90      0.53      0.67        17

    accuracy                           0.88        75
   macro avg       0.89      0.76      0.80        75
weighted avg       0.88      0.88      0.87        75

The f1 marco score is: 0.7967479674796748


In [12]:
# Pipeline #2: TF-IDF over word 1- to 3-grams feeding an SGDClassifier with
# the perceptron loss. On the held-out split evaluated in the next cell this
# gives ~0.87 accuracy and ~0.79 macro-F1.
# Note: this rebinds train_pipe and y_pred from the previous pipeline.
sgd_steps = [
    (
        'vectorizer',
        TfidfVectorizer(
            analyzer='word',
            ngram_range=(1, 3),
            max_df=0.7,
            token_pattern=r'\b\w{3,10}\b',
        ),
    ),
    (
        'clf',
        SGDClassifier(random_state=42, alpha=0.006, loss='perceptron'),
    ),
]
train_pipe = Pipeline(steps=sgd_steps)
train_pipe.fit(X_train, y_train);
y_pred = train_pipe.predict(X_test);


In [13]:
# Evaluate the current predictions (y_pred) against the held-out labels.
# The confusion matrix is displayed because ast_node_interactivity is "all".
confusion_matrix(y_test, y_pred)
clf_rep = metrics.classification_report(y_test, y_pred)
print(clf_rep)
# Label corrected: the metric is the macro-averaged F1 (was misspelled "marco").
print('The f1 macro score is:', f1_score(y_test, y_pred, average='macro'))

array([[55,  3],
       [ 7, 10]], dtype=int64)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92        58
           1       0.77      0.59      0.67        17

    accuracy                           0.87        75
   macro avg       0.83      0.77      0.79        75
weighted avg       0.86      0.87      0.86        75

The f1 marco score is: 0.7916666666666667
