### In this file we are going to train our four models with different combinations of preprocessing methods and vectorization methods for each model, so we can find out which combination of preprocessing and vectorization methods suits better for every type of the models.

In [1]:
### imports 
import pickle

import pandas as pd
import gensim.downloader as api

from fasttext import FastText
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from config import path_to_data_folder, path_to_fast_text_model, glove_path

In [2]:
### LOAD THE MODELS
word2vec_model = api.load("word2vec-google-news-300") # model trained on lower case words, use lower case tokens
# fast_model = FastText.load_model(path_to_fast_text_model)

# with open(glove_path, 'rb') as file:
#     glove_300d = pickle.load(file)

In [3]:
# make neccesary imports for preprocessing and vectorizatio
from functions_preprocessing import TextPreprocessor_flow_1, TextPreprocessor_flow_2, TextPreprocessor_flow_3, TextPreprocessor_flow_4, TextPreprocessor_flow_5, TextPreprocessor_flow_6, TextPreprocessor_flow_7, TextPreprocessor_flow_8, TextPreprocessor_flow_9, TextPreprocessor_flow_10, TextPreprocessor_flow_11, TextPreprocessor_flow_12, TextPreprocessor_flow_13, TextPreprocessor_flow_14, TextPreprocessor_flow_15, TextPreprocessor_flow_16, TextPreprocessor_flow_17, TextPreprocessor_flow_18, TextPreprocessor_flow_19, TextPreprocessor_flow_20
from functions_vectorization import TfidfVectorizer, CountVectorizer, Word2VecVectorizer, FastTextVectorizer, GloveVectorizer

list_of_preprocessors = [TextPreprocessor_flow_1, TextPreprocessor_flow_2, TextPreprocessor_flow_3, TextPreprocessor_flow_4, TextPreprocessor_flow_5, TextPreprocessor_flow_6, TextPreprocessor_flow_7, TextPreprocessor_flow_8, TextPreprocessor_flow_9, TextPreprocessor_flow_10, TextPreprocessor_flow_11, TextPreprocessor_flow_12, TextPreprocessor_flow_13, TextPreprocessor_flow_14, TextPreprocessor_flow_15, TextPreprocessor_flow_16, TextPreprocessor_flow_17, TextPreprocessor_flow_18, TextPreprocessor_flow_19, TextPreprocessor_flow_20]
list_of_vectorizers = [TfidfVectorizer, CountVectorizer, Word2VecVectorizer(word2vec_model)]
list_of_models = [LogisticRegression, RandomForestClassifier]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexraudvee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Create different pipelines for future model training, testing, evaluating

In [35]:
# PIPELINES FOR LOGISTICREGRESSION
created_pipelines = {}
for model in list_of_models:
    for vectorizer in list_of_vectorizers:
        for preprocessing_flow in list_of_preprocessors:

            pipeline_name = f"pipeline_{preprocessing_flow.__name__}_{vectorizer.__name__}_{model.__name__}"

            pipeline = Pipeline([
                                ('preprocess', preprocessing_flow),
                                ('vectorizer', vectorizer),
                                ('model', model)
                                ])

            created_pipelines[pipeline_name] = pipeline
    

In [36]:
print(created_pipelines)

{'pipeline_TextPreprocessor_flow_1_TfidfVectorizer_LogisticRegression': Pipeline(steps=[('preprocess',
                 <class 'functions_preprocessing.TextPreprocessor_flow_1'>),
                ('vectorizer',
                 <class 'sklearn.feature_extraction.text.TfidfVectorizer'>),
                ('model',
                 <class 'sklearn.linear_model._logistic.LogisticRegression'>)]), 'pipeline_TextPreprocessor_flow_2_TfidfVectorizer_LogisticRegression': Pipeline(steps=[('preprocess',
                 <class 'functions_preprocessing.TextPreprocessor_flow_2'>),
                ('vectorizer',
                 <class 'sklearn.feature_extraction.text.TfidfVectorizer'>),
                ('model',
                 <class 'sklearn.linear_model._logistic.LogisticRegression'>)]), 'pipeline_TextPreprocessor_flow_3_TfidfVectorizer_LogisticRegression': Pipeline(steps=[('preprocess',
                 <class 'functions_preprocessing.TextPreprocessor_flow_3'>),
                ('vectorizer',
 

In [28]:
### PREPARE THE DATA FOR FITTING THE FUTURE MODELS
# Reading datasets
gender_df = pd.read_csv(f'{path_to_data_folder}/gender.csv')

jud_per_df = pd.read_csv(f'{path_to_data_folder}/judging_perceiving.csv')

political_df  = pd.read_csv(f'{path_to_data_folder}/political_leaning.csv')

# Split the dataset into training and testing sets
X = gender_df['post'].tolist()  # Replace 'text_column' with the column containing text data
y = gender_df['female'].tolist()  # Replace 'target_column' with the column containing target labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [29]:
len(X_train), len(y_train), len(X_test), len(y_test)

(35708, 35708, 8927, 8927)

In [39]:
created_pipelines['pipeline_TextPreprocessor_flow_10_TfidfVectorizer_LogisticRegression'].fit(X_train, y_train)

scores = {}

# Future script
for pipeline_name in created_pipelines:
    created_pipelines[pipeline_name].fit(X_train, y_train)
    scores[pipeline_name] = created_pipelines[pipeline_name].score(X_test, y_test)

AttributeError: 'list' object has no attribute 'fit'