In [None]:
import json
import pandas as pd
from src.comparator import compare_metadata, Comparator, print_diff, print_correctly_extracted
import src.datasets as load
from src.kuba_information_extractor import KubaInformationExtractor
import pprint

# Directory handed to the project's dataset loader.
DATA_PATH = "data/"
data_sets = load.load(DATA_PATH)

import numpy as np  
import re  
from sklearn.datasets import load_files  
import pickle  

from src.morf_utils import MorfWrapper
from src.compare_utils import deogonkify
import morfeusz2

# Integer class id used for each value of meta["preferredGender"].
translation = {
    "female": 1,
    "male": 2,
    "any": 0,
}

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Keep every record whose preferred gender is anything other than "male";
# records whose preference is None pass the filter as well.
data_sets = [
    record
    for record in data_sets
    if record["meta"]["preferredGender"] != "male"
]

In [None]:
# X collects the raw text of every record; y holds the integer class id of the
# record's preferred gender, with a missing (None) preference counted as "any".
X = [record["content"] for record in data_sets]

preferred_genders = (record["meta"]["preferredGender"] for record in data_sets)
y = np.array(
    [translation["any" if gender is None else gender] for gender in preferred_genders]
)

display(y)

In [None]:
# Normalise every raw text before vectorisation.  The letter classes below are
# ASCII-only ([a-zA-Z]), so single non-ASCII letters are left in place.
documents = []

for raw_text in X:
    # Replace every non-word character (punctuation, symbols) with a space.
    document = re.sub(r'\W', ' ', str(raw_text))

    # Drop single letters that stand alone between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Drop a single letter at the very start of the text.  The anchor must be
    # an unescaped ^: an escaped \^ looks for a literal caret character, and
    # none can be left after the \W substitution above, so it never matches.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse every run of whitespace into one space.
    document = re.sub(r'\s+', ' ', document)

    documents.append(document)

In [None]:

# Stop-word list from the project's dataset helpers.
# NOTE(review): none of the cells visible here passes `stopwords` to the
# vectorizer — confirm whether it is meant to be used.
stopwords = load.load_stopwords()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over lower-cased unigrams and bigrams, limited to 500 features.
tfidfconverter = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    min_df=1,
    max_features=500,
)

# fit() hands back the vectorizer it was called on, so `tfidf` and
# `tfidfconverter` refer to the same fitted object.
tfidf = tfidfconverter.fit(documents)

# Dense document-term matrix; from here on X holds features, not raw text.
sparse_features = tfidf.transform(documents)
X = sparse_features.toarray()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the samples; the fixed seed keeps the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=3, test_size=0.25)

In [None]:
# Sweep the number of boosting rounds and record the mean 4-fold CV accuracy
# for each setting.  The ensemble is seeded so the table is the same on every
# run; without a random_state the scores change between executions.
ADABOOST_SEED = 3

scores = {}
for p in [5, 10, 20, 50, 75, 125, 250, 500, 1000]:
    clf = AdaBoostClassifier(n_estimators=p, random_state=ADABOOST_SEED)
    score = cross_val_score(clf, X, y, scoring="accuracy", cv=4)
    scores[p] = score.mean()

scores_by_metric = {"AdaBoost": scores}

# One row per n_estimators value; the index is named "ada" and then turned
# into an ordinary column of that name.
df = pd.DataFrame(data=scores_by_metric)
df.index.name = "ada"
df = df.reset_index(level=0)
print("AdaBoost: mean 4-fold CV accuracy by n_estimators")
display(df)

In [None]:
from sklearn import naive_bayes

# Bernoulli naive Bayes: fit on the training split, predict the held-out
# split, then show the mean 4-fold cross-validation score over the full data.
gnb = naive_bayes.BernoulliNB()
clf = gnb.fit(X_train, y_train)  # fit() returns gnb itself
y_pred = clf.predict(X_test)

scores = cross_val_score(clf, X, y, cv=4)
mean_cv_score = scores.mean()
display(mean_cv_score)

In [None]:
from sklearn import tree

# Try 300 differently-seeded entropy trees and keep the one with the best mean
# 4-fold CV score.  Each candidate carries its own random_state: without it
# every candidate is the same unseeded configuration, so the estimator stored
# in best_clf would have no connection to the score that selected it.
best = float("-inf")
best_clf = None
for i in range(0, 300):
    clf = tree.DecisionTreeClassifier(criterion="entropy", random_state=i)
    scores = cross_val_score(clf, X, y, cv=4)
    mean_score = scores.mean()
    if mean_score > best:
        best = mean_score
        best_clf = clf
display(best)

In [None]:
# Train the selected tree on the training split, then predict the held-out
# split; fit() returns the estimator, so the two calls can be chained.
y_pred = best_clf.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier


def _knn_cv_scores(neighbor_counts, **knn_params):
    """Cross-validate a k-NN classifier for several values of n_neighbors.

    Args:
        neighbor_counts: iterable of n_neighbors values to try.
        **knn_params: extra keyword arguments for KNeighborsClassifier
            (for example ``p=2`` or ``metric='chebyshev'``).

    Returns:
        dict mapping each n_neighbors value to the mean 4-fold CV accuracy
        measured on the notebook-level ``X`` and ``y``.
    """
    mean_scores = {}
    for k in neighbor_counts:
        clf = KNeighborsClassifier(n_neighbors=k, **knn_params)
        score = cross_val_score(clf, X, y, scoring="accuracy", cv=4)
        mean_scores[k] = score.mean()
    return mean_scores


neighbor_counts = range(1, 21)

# Minkowski distances of order 1 to 4, then the Chebyshev distance.
scores_by_metric = {}
for p in range(1, 5):
    scores_by_metric[f"minkowski-{p}"] = _knn_cv_scores(neighbor_counts, p=p)
scores_by_metric["chebyshev"] = _knn_cv_scores(neighbor_counts, metric='chebyshev')

# One row per n_neighbors value, one column per distance metric.
df = pd.DataFrame(data=scores_by_metric)
df.index.name = "n_neighbors"
df = df.reset_index(level=0)
print("k-NN: mean 4-fold CV accuracy by distance metric and n_neighbors")
display(df)

In [None]:
from sklearn import svm

# Support-vector classifier with gamma='scale': cross-validate on the full
# data set, then fit on the training split and predict the held-out split.
# cross_val_score works on clones, so it does not depend on clf being fitted.
clf = svm.SVC(gamma='scale')
scores = cross_val_score(clf, X, y, cv=4)
y_pred = clf.fit(X_train, y_train).predict(X_test)
display(scores.mean())

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Brute-force 4-nearest-neighbours with the p=2 Minkowski distance.
clf = KNeighborsClassifier(algorithm="brute", p=2, n_neighbors=4)

# Cross-validation runs on clones of clf, so it can be done before fitting.
scores = cross_val_score(clf, X, y, scoring="accuracy", cv=4)

# Leave clf fitted on the training split and predict the held-out split.
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

display(scores.mean())

In [None]:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Class names in the order their rows and columns appear in the matrix.  The
# matching ids are passed to confusion_matrix explicitly, so the matrix is
# always 2x2 and in this order even when one class is absent from both y_test
# and y_pred (otherwise the hard-coded names could not be attached to it).
class_names = ["any", "female"]
class_ids = [translation[name] for name in class_names]

# Rows are the true classes, columns the predicted ones.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

print(conf_matrix)
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

conf_matrix_df = pd.DataFrame(conf_matrix, index=class_names, columns=class_names)
sns.heatmap(conf_matrix_df, annot=True)
plt.show()

In [None]:
# Persist whichever classifier is currently bound to `clf`.
with open('classifier_gender.pickle', mode='wb') as classifier_file:
    pickle.dump(clf, classifier_file)

In [None]:
# Persist the fitted TF-IDF vectorizer.  The context manager closes the file
# handle when the block ends instead of leaving it open until garbage
# collection.
with open("tfidf_gender.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [None]:
# Redraw the confusion-matrix heatmap and write its figure to a PNG file.
heatmap_axes = sns.heatmap(conf_matrix_df, annot=True)
heatmap_figure = heatmap_axes.get_figure()
heatmap_figure.savefig("gender-knn.png")