### In this file we train our four models with different combinations of preprocessing and vectorization methods, so that we can find out which combination of preprocessing and vectorization works best for each type of model.

In [9]:
### imports 
import csv

import pandas as pd
import gensim.downloader as api

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from config import path_to_data_folder
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from functions_vectorization import TfidfVectorizer, CountVectorizer, Word2VecVectorizer
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [3]:
### LOAD THE MODELS
# Fetch the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API. The resulting object is handed to Word2VecVectorizer when the
# Word2Vec pipelines are built in the training cells further down.
# NOTE(review): the trailing remark about lower-case training data comes from the
# original author and is not verifiable from this notebook — confirm against the
# model's vocabulary before relying on it.
word2vec_model = api.load("word2vec-google-news-300") # model trained on lower case words, use lower case tokens

#### Load, clean, and split the data used to train and evaluate the pipelines
#### Create the different pipelines for model training, testing, and evaluation

In [2]:
import gensim.downloader as api

# This cell repeats the Word2Vec load performed by the "LOAD THE MODELS" cell.
# Only load the vectors when no earlier cell has already bound `word2vec_model`,
# so running the notebook top to bottom does not load the same model twice.
if "word2vec_model" not in globals():
    word2vec_model = api.load("word2vec-google-news-300") # model trained on lower case words, use lower case tokens

In [1]:
# Train and score every (model, vectorizer, preprocessed dataset) combination
# listed below, appending one row per finished pipeline to scores.csv so that an
# interrupted run can be resumed without redoing finished work.

# NOTE(review): 'gender_df_preprocessed_8' is not in this list — confirm that is intentional.
list_of_preprocessed_data = ['gender_df_preprocessed_0', 'gender_df_preprocessed_1', "gender_df_preprocessed_2", 'gender_df_preprocessed_3', 'gender_df_preprocessed_4', 'gender_df_preprocessed_5', 'gender_df_preprocessed_6', 'gender_df_preprocessed_7', 'gender_df_preprocessed_9', 'gender_df_preprocessed_10', 'gender_df_preprocessed_11', 'gender_df_preprocessed_12', 'gender_df_preprocessed_13', 'gender_df_preprocessed_14', 'gender_df_preprocessed_15', 'gender_df_preprocessed_16', 'gender_df_preprocessed_17']
list_of_vectorizers = [TfidfVectorizer]
list_of_models = [RandomForestClassifier, LogisticRegression]


# PIPELINE COMBINATIONS AND THEIR SCORES FOR GENDER DATA
file_path = "scores.csv"

# Collect the names of pipelines that already have a row in the scores file.
# On a first run the file does not exist yet (or has no content); treat that as
# "nothing finished" instead of crashing before any pipeline is trained.
try:
    scores_df = pd.read_csv(file_path, header=None, names=['pipeline_name', 'pipeline_scores'])
    finished_ = scores_df['pipeline_name'].tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):
    finished_ = []

for model in list_of_models:
    for vectorizer in list_of_vectorizers:
        for preprocessed_data in list_of_preprocessed_data:

            pipeline_name = f"pipeline_{preprocessed_data}_{vectorizer.__name__}_{model.__name__}"

            # Resume support: skip combinations that were already scored.
            if pipeline_name in finished_:
                continue

            pipeline = Pipeline([
                ('vectorizer', vectorizer()),
                ('model', model())
            ])

            # Kept separate from the scores table so neither frame shadows the other.
            df = pd.read_json(f'{path_to_data_folder}/{preprocessed_data}.json')

            # 'post' is used as the model input and 'female' as the target.
            X = df['post'].tolist()
            y = df['female'].tolist()

            # Split the dataset into training and testing sets (fixed seed for repeatability)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

            # Fit the pipeline on the training split only
            pipeline.fit(X_train, y_train)

            # Predict on the held-out test set
            y_pred = pipeline.predict(X_test)

            # Second predict_proba column is the score passed to ROC AUC
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

            # Row layout: pipeline name, then a list of formatted metric strings
            # (csv.writer stores the list via its string representation).
            input_in_the_file = [pipeline_name, [f'Score: {pipeline.score(X_test, y_test)}', f'precision: {precision_score(y_test, y_pred)}', f'Recall: {recall_score(y_test, y_pred)}', f'ROC AUC: {roc_auc_score(y_test, y_pred_proba)}']]

            # Append new data to the CSV file right away, so progress survives an interruption
            with open(file_path, 'a', newline='') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(input_in_the_file)

            print(f'{preprocessed_data} {vectorizer.__name__} {model.__name__} finished and stored')

            finished_.append(pipeline_name)

gender_df_preprocessed_0 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_1 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_2 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_3 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_4 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_5 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_6 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_7 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_9 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_10 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_11 TfidfVectorizer RandomForestClassifier finished and stored
gender_df_preprocessed_12 TfidfVectorizer RandomForestClassifier finished 

In [None]:
# Train and score every (model, vectorizer, preprocessed dataset) combination
# listed below, appending one row per finished pipeline to scores.csv so that an
# interrupted run can be resumed without redoing finished work.

# NOTE(review): 'gender_df_preprocessed_8' is not in this list — confirm that is intentional.
list_of_preprocessed_data = ['gender_df_preprocessed_0', 'gender_df_preprocessed_1', "gender_df_preprocessed_2", 'gender_df_preprocessed_3', 'gender_df_preprocessed_4', 'gender_df_preprocessed_5', 'gender_df_preprocessed_6', 'gender_df_preprocessed_7', 'gender_df_preprocessed_9', 'gender_df_preprocessed_10', 'gender_df_preprocessed_11', 'gender_df_preprocessed_12', 'gender_df_preprocessed_13', 'gender_df_preprocessed_14', 'gender_df_preprocessed_15', 'gender_df_preprocessed_16', 'gender_df_preprocessed_17']
list_of_vectorizers = [CountVectorizer]
list_of_models = [RandomForestClassifier, LogisticRegression]


# PIPELINE COMBINATIONS AND THEIR SCORES FOR GENDER DATA
file_path = r"scores.csv"

# Collect the names of pipelines that already have a row in the scores file.
# A missing (or content-less) file simply means nothing has been scored yet.
try:
    scores_df = pd.read_csv(file_path, names=['pipeline_name', 'pipeline_scores'])
    finished_ = scores_df['pipeline_name'].tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):
    finished_ = []

for model in list_of_models:
    for vectorizer in list_of_vectorizers:
        for preprocessed_data in list_of_preprocessed_data:

            pipeline_name = f"pipeline_{preprocessed_data}_{vectorizer.__name__}_{model.__name__}"

            # Resume support: skip combinations that were already scored.
            # NOTE(review): rows written by an earlier version of this cell were
            # produced by a pipeline fitted on the test split; delete such rows
            # from scores.csv so they are recomputed.
            if pipeline_name in finished_:
                continue

            pipeline = Pipeline([
                ('vectorizer', vectorizer()),
                ('model', model())
            ])

            df = pd.read_json(rf'{path_to_data_folder}/{preprocessed_data}.json')

            # 'post' is used as the model input and 'female' as the target.
            X = df['post'].tolist()
            y = df['female'].tolist()

            # Split the dataset into training and testing sets (fixed seed for repeatability)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

            # Fit on the TRAINING split. Fitting on the test split (as this cell
            # did before) makes every metric below a training-set score.
            pipeline.fit(X_train, y_train)

            # Predict on the held-out test set
            y_pred = pipeline.predict(X_test)

            # Second predict_proba column is the score passed to ROC AUC
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

            # Row layout: pipeline name, then a list of formatted metric strings
            # (csv.writer stores the list via its string representation).
            input_in_the_file = [pipeline_name, [f'Score: {pipeline.score(X_test, y_test)}', f'precision: {precision_score(y_test, y_pred)}', f'Recall: {recall_score(y_test, y_pred)}', f'ROC AUC: {roc_auc_score(y_test, y_pred_proba)}']]

            # Append new data to the CSV file right away, so progress survives an interruption
            with open(file_path, 'a', newline='') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(input_in_the_file)

            print(f'{preprocessed_data} {vectorizer.__name__} {model.__name__} finished and stored')

            finished_.append(pipeline_name)

In [None]:
# Train and score every (model, vectorizer, preprocessed dataset) combination
# listed below, appending one row per finished pipeline to scores.csv so that an
# interrupted run can be resumed without redoing finished work.

# NOTE(review): 'gender_df_preprocessed_8' is not in this list — confirm that is intentional.
list_of_preprocessed_data = ['gender_df_preprocessed_0', 'gender_df_preprocessed_1', "gender_df_preprocessed_2", 'gender_df_preprocessed_3', 'gender_df_preprocessed_4', 'gender_df_preprocessed_5', 'gender_df_preprocessed_6', 'gender_df_preprocessed_7', 'gender_df_preprocessed_9', 'gender_df_preprocessed_10', 'gender_df_preprocessed_11', 'gender_df_preprocessed_12', 'gender_df_preprocessed_13', 'gender_df_preprocessed_14', 'gender_df_preprocessed_15', 'gender_df_preprocessed_16', 'gender_df_preprocessed_17']
list_of_vectorizers = [Word2VecVectorizer]
list_of_models = [RandomForestClassifier, LogisticRegression]


# PIPELINE COMBINATIONS AND THEIR SCORES FOR GENDER DATA
file_path = r"scores.csv"

# Collect the names of pipelines that already have a row in the scores file.
# A missing (or content-less) file simply means nothing has been scored yet.
try:
    scores_df = pd.read_csv(file_path, names=['pipeline_name', 'pipeline_scores'])
    finished_ = scores_df['pipeline_name'].tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):
    finished_ = []

for model in list_of_models:
    for vectorizer in list_of_vectorizers:
        for preprocessed_data in list_of_preprocessed_data:

            # Classes expose their name as `__name__`; the scikit-learn estimator
            # classes have no `.name` attribute, so `.name` raised AttributeError.
            pipeline_name = f"pipeline_{preprocessed_data}_{vectorizer.__name__}_{model.__name__}"

            # Resume support: skip combinations that were already scored.
            if pipeline_name in finished_:
                continue

            # The Word2Vec vectorizer is constructed with the loaded embedding model.
            pipeline = Pipeline([
                ('vectorizer', vectorizer(word2vec_model)),
                ('model', model())
            ])

            df = pd.read_json(rf'{path_to_data_folder}/{preprocessed_data}.json')

            # 'post' is used as the model input and 'female' as the target.
            X = df['post'].tolist()
            y = df['female'].tolist()

            # Split the dataset into training and testing sets (fixed seed for repeatability)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

            # Fit the pipeline on the training split only
            pipeline.fit(X_train, y_train)

            # Predict on the held-out test set
            y_pred = pipeline.predict(X_test)

            # Second predict_proba column is the score passed to ROC AUC
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

            # Row layout: pipeline name, then a list of formatted metric strings
            # (csv.writer stores the list via its string representation).
            input_in_the_file = [pipeline_name, [f'Score: {pipeline.score(X_test, y_test)}', f'precision: {precision_score(y_test, y_pred)}', f'Recall: {recall_score(y_test, y_pred)}', f'ROC AUC: {roc_auc_score(y_test, y_pred_proba)}']]

            # Append new data to the CSV file right away, so progress survives an interruption
            with open(file_path, 'a', newline='') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(input_in_the_file)

            print(f'{preprocessed_data} {vectorizer.__name__} {model.__name__} finished and stored')

            finished_.append(pipeline_name)

In [None]:
# Train and score an SVC (wrapped in OneVsRestClassifier) for every vectorizer
# and preprocessed dataset listed below, appending one row per finished pipeline
# to scores.csv so that an interrupted run can be resumed.

# NOTE(review): 'gender_df_preprocessed_8' is not in this list — confirm that is intentional.
list_of_preprocessed_data = ['gender_df_preprocessed_0', 'gender_df_preprocessed_1', "gender_df_preprocessed_2", 'gender_df_preprocessed_3', 'gender_df_preprocessed_4', 'gender_df_preprocessed_5', 'gender_df_preprocessed_6', 'gender_df_preprocessed_7', 'gender_df_preprocessed_9', 'gender_df_preprocessed_10', 'gender_df_preprocessed_11', 'gender_df_preprocessed_12', 'gender_df_preprocessed_13', 'gender_df_preprocessed_14', 'gender_df_preprocessed_15', 'gender_df_preprocessed_16', 'gender_df_preprocessed_17']
list_of_vectorizers = [TfidfVectorizer, CountVectorizer, Word2VecVectorizer]
list_of_models = [SVC]


# PIPELINE COMBINATIONS AND THEIR SCORES FOR GENDER DATA
file_path = r"scores.csv"

# Collect the names of pipelines that already have a row in the scores file.
# A missing (or content-less) file simply means nothing has been scored yet.
try:
    scores_df = pd.read_csv(file_path, names=['pipeline_name', 'pipeline_scores'])
    finished_ = scores_df['pipeline_name'].tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):
    finished_ = []

for model in list_of_models:
    for vectorizer in list_of_vectorizers:
        for preprocessed_data in list_of_preprocessed_data:

            # Classes expose their name as `__name__`; the scikit-learn classes
            # have no `.name` attribute, so `.name` raised AttributeError.
            pipeline_name = f"pipeline_{preprocessed_data}_{vectorizer.__name__}_{model.__name__}"

            # Resume support: skip combinations that were already scored.
            if pipeline_name in finished_:
                continue

            # Only the Word2Vec vectorizer takes the loaded embedding model;
            # the other vectorizers are built with no arguments.
            if vectorizer.__name__ == 'Word2VecVectorizer':
                vectorizer_step = vectorizer(word2vec_model)
            else:
                vectorizer_step = vectorizer()

            # probability=True is required for SVC to provide predict_proba.
            pipeline = Pipeline([
                ('vectorizer', vectorizer_step),
                ('model', OneVsRestClassifier(model(probability=True)))
            ])

            # Read from the configured data folder (as the other training cells
            # do) rather than a machine-specific absolute path.
            df = pd.read_json(rf'{path_to_data_folder}/{preprocessed_data}.json')

            # 'post' is used as the model input and 'female' as the target.
            X = df['post'].tolist()
            y = df['female'].tolist()

            # Split the dataset into training and testing sets (fixed seed for repeatability)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

            # Fit the pipeline on the training split only
            pipeline.fit(X_train, y_train)

            # Predict on the held-out test set
            y_pred = pipeline.predict(X_test)

            # Second predict_proba column is the score passed to ROC AUC
            probabilities = pipeline.predict_proba(X_test)[:, 1]

            # NOTE(review): precision/recall use average="micro" here, whereas the
            # other training cells use the default averaging — confirm the SVC
            # rows are meant to be compared against those on this basis.
            input_in_the_file = [pipeline_name, [f'Score: {pipeline.score(X_test, y_test)}',
                                                 f'precision: {precision_score(y_test, y_pred, average="micro")}',
                                                 f'Recall: {recall_score(y_test, y_pred, average="micro")}',
                                                 f'ROC AUC: {roc_auc_score(y_test, probabilities)}']]

            # Append new data to the CSV file right away, so progress survives an interruption
            with open(file_path, 'a', newline='') as csv_file:
                csv_writer = csv.writer(csv_file)
                csv_writer.writerow(input_in_the_file)

            print(f'{preprocessed_data} {vectorizer.__name__} {model.__name__} finished and stored')

            finished_.append(pipeline_name)