# Notebook for creating the TfIdf Embeddings of the Story feature

## Setup

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# project folder used in the next cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder the working directory; the data paths defined
# further down are relative to it (they start with ".").
%cd /content/drive/MyDrive/Project/KickLearning/

/content/drive/.shortcut-targets-by-id/1iWelwZQ6yutjZacqvhb1xSPIvi_nlIiH/Project/KickLearning


Libraries

In [None]:
from os.path import join
from os import listdir

import pandas as pd
import numpy as np

from feature_analysis.text_embeding import TextEncoder

from time import time

In [None]:
# Input folder (translated texts) and output folder (embeddings), both
# located under ./data relative to the working directory.
data_path, destination_path = (
    join(".", "data", folder_name)
    for folder_name in ("texts_translated", "texts_embedding")
)

## Pooling all the data together
Make one single dataset

In [None]:
# Read only the "id" and "story" columns of every file in data_path and stack
# them into a single frame.
# The files are visited in sorted name order: os.listdir returns entries in
# arbitrary order, which would make the row order of the pooled frame (and so
# the seeded sample drawn from it below) differ between machines or runs.
df_complete = pd.concat(
    [
        pd.read_csv(join(data_path, file_name), usecols=["id", "story"])
        for file_name in sorted(listdir(data_path))
    ]
)

In [None]:
# Draw a random half of the pooled rows with a fixed seed; this sample is the
# data the encoder is fitted on further down.
df_sample = df_complete.sample(frac=0.5, random_state=1234)

In [None]:
# Drop the reference to the pooled frame: only df_sample is used for fitting,
# and the transform step re-reads the files one by one.
del df_complete

## Embedding Data

In [None]:
# Create the project's text encoder.
# NOTE(review): pca_var_explained=0.8 presumably is the share of variance the
# PCA step must retain -- confirm against feature_analysis.text_embeding.
encoder = TextEncoder(pca_var_explained=0.8)

In [None]:
# Fit the encoder pipeline on the sampled stories and report the elapsed
# wall-clock time as minutes:seconds.
start = time()
# Missing stories (NaN) are replaced by empty strings before fitting.
encoder.fit_pipeline(data=df_sample["story"].fillna("").to_list())
# Take a single clock reading and split it with divmod so minutes and seconds
# describe the same instant and seconds can never show as 60; seconds are
# zero-padded so that e.g. 62 s is printed as "1:02" instead of "1:2".
minutes, seconds = divmod(int(time() - start), 60)
print(f"fitting time for TfIdf {minutes}:{seconds:02d}")

fitting time for TfIdf 13:28


In [None]:
# The sample was only needed for fitting; drop the reference before the
# per-file transform below.
del df_sample

Working on a single file at a time to avoid RAM overflow

In [None]:
# Embed the stories file by file so that only one file's texts are held in
# memory at a time, then assemble all per-file results into final_df.
#
# The per-file frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop copies all previously accumulated rows on every iteration.
embedded_frames = []
# sorted(): os.listdir order is arbitrary; sorting keeps the row order of the
# saved output stable across runs.
for i, file_name in enumerate(sorted(listdir(data_path))):
    print(f" - starting with document {i}")
    start = time()

    df = pd.read_csv(join(data_path, file_name), usecols=["id", "story"])

    # Missing stories (NaN) are replaced by empty strings before encoding.
    texts = df["story"].fillna("").to_list()

    embeddings = encoder.transform_pipeline(data=texts)

    # Keep the id column and place the embedding columns (prefixed "text_")
    # next to it; reusing df.index keeps each embedding row aligned with its id.
    embedded_frames.append(
        pd.concat(
            [df["id"], pd.DataFrame(embeddings, index=df.index).add_prefix("text_")],
            axis=1,
        )
    )

    # Single clock reading split with divmod; seconds are zero-padded so that
    # e.g. 62 s is printed as "1:02" instead of "1:2".
    minutes, seconds = divmod(int(time() - start), 60)
    print(f"time for document {i} -> {minutes}:{seconds:02d}\n")

# pd.concat raises on an empty list, so fall back to an empty frame when the
# input folder contains no files.
final_df = pd.concat(embedded_frames) if embedded_frames else pd.DataFrame()

 - starting with document 0
time for document 0 -> 1:28

 - starting with document 1
time for document 1 -> 1:31

 - starting with document 2
time for document 2 -> 1:32

 - starting with document 3
time for document 3 -> 1:26

 - starting with document 4
time for document 4 -> 1:29

 - starting with document 5
time for document 5 -> 1:28

 - starting with document 6
time for document 6 -> 1:27

 - starting with document 7
time for document 7 -> 1:26

 - starting with document 8
time for document 8 -> 1:26

 - starting with document 9
time for document 9 -> 1:27

 - starting with document 10
time for document 10 -> 1:23

 - starting with document 11
time for document 11 -> 1:25

 - starting with document 12
time for document 12 -> 1:37

 - starting with document 13
time for document 13 -> 1:2

 - starting with document 14
time for document 14 -> 1:38

 - starting with document 15
time for document 15 -> 1:34

 - starting with document 16
time for document 16 -> 1:26



## Saving Data

In [None]:
# Write the combined id + embedding table to a single CSV in the destination
# folder; index=False leaves the DataFrame index out of the file.
final_df.to_csv(join(destination_path, "complete_story_embedding.csv"), index=False)

## Saving model for interpretability

In [None]:
# Store the fitted encoder next to the embeddings so it can be inspected
# later. NOTE(review): the .pkl name suggests a pickle file -- only load such
# files from trusted sources.
encoder.save_object(path=join(destination_path, "encoder_with_tfidf_and_pca.pkl"))

saved object in pikle file
