---
# Preprocessing
---

### Load libraries and data

In [1]:
import pandas as pd
from pathlib import Path
import spacy 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
DATA_PATH="../data/raw/aclImdb_v1/aclImdb/train/"
PROCESSED_DATA_PATH="../data/processed/"
RANDOM_SEED=42

In [3]:
columns=["Review", "Label"]
df = pd.DataFrame(columns=columns)

Add positive reviews

In [4]:
p = Path(DATA_PATH ) / 'pos'
for file_path in p.iterdir():
    with file_path.open('r') as file:
        new_row={'Review':file.read(), 'Label':1}
        df.loc[len(df)]=new_row

Add negative reviews

In [5]:
p = Path(DATA_PATH) / 'neg'
for file_path in p.iterdir():
    with file_path.open('r') as file:
        new_row={'Review':file.read(), 'Label':0}
        df.loc[len(df)]=new_row

In [6]:
df.head(5)

Unnamed: 0,Review,Label
0,I came in in the middle of this film so I had ...,1
1,Clint Eastwood reprises his role as Dirty Harr...,1
2,Okay this movie fine like I said but you surel...,1
3,When a small hobbit named Frodo Baggins inheri...,1
4,This film was made in 1943 when i think Judy w...,1


### Preprocessing data

lowercase text

In [7]:
def lower_case(text):
    return text.lower()

df['Review'] = df['Review'].apply(lower_case)

In [8]:
df.head()

Unnamed: 0,Review,Label
0,i came in in the middle of this film so i had ...,1
1,clint eastwood reprises his role as dirty harr...,1
2,okay this movie fine like i said but you surel...,1
3,when a small hobbit named frodo baggins inheri...,1
4,this film was made in 1943 when i think judy w...,1


In [9]:
shuffled_df = df.sample(frac=1).reset_index(drop=True)
shuffled_df.head()

Unnamed: 0,Review,Label
0,"enormous fun for both adults and children, thi...",1
1,this film is a complete re-imagining of romeo ...,1
2,this is a really cool movie! i remember first ...,1
3,this truly funny movie has a zany cast of char...,1
4,don't really know where to start with one of t...,0


In [10]:
shuffled_df.to_csv(PROCESSED_DATA_PATH+'df.csv')

Tokenization and lemmatization

In [11]:
nlp=spacy.load('en_core_web_sm')

In [12]:
def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not any(char.isdigit() for char in token.text)]
    return ' '.join(token for token in tokens)

# Apply tokenization to each row in the DataFrame
tokenized_df=pd.DataFrame()
tokenized_df['Label']=df['Label']
tokenized_df['tokens'] = df['Review'].apply(tokenize_text)
#df['tokens'] = df['Review'].apply(tokenize_text)

KeyboardInterrupt: 

In [None]:
tokenized_df.head()

Unnamed: 0,Review,Label,tokens
0,i came in in the middle of this film so i had ...,1,come middle film idea credit title till look r...
1,clint eastwood reprises his role as dirty harr...,1,clint eastwood reprise role dirty harry time c...
2,okay this movie fine like i said but you surel...,1,okay movie fine like say surely need watch wor...
3,when a small hobbit named frodo baggins inheri...,1,small hobbit name frodo baggin inherit magic r...
4,this film was made in 1943 when i think judy w...,1,film think judy peak look wise previous film g...


### Vectorization

Bag of Words

In [None]:
# Bag of Words
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df['tokens'])
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=count_vectorizer.get_feature_names_out())
bow_df["Label"]=df["Label"]

In [None]:
bow_df.to_csv(PROCESSED_DATA_PATH + '/bow_df.csv', index=False)

TF-IDF

In [None]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['tokens'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df["Label"]=df["Label"]
print("TF-IDF representation:\n", tfidf_df)

TF-IDF representation:
        _________  _____________________________________  \
0            0.0                                    0.0   
1            0.0                                    0.0   
2            0.0                                    0.0   
3            0.0                                    0.0   
4            0.0                                    0.0   
...          ...                                    ...   
12495        0.0                                    0.0   
12496        0.0                                    0.0   
12497        0.0                                    0.0   
12498        0.0                                    0.0   
12499        0.0                                    0.0   

       __________________________________________________________________  \
0                                                    0.0                    
1                                                    0.0                    
2                                   

In [None]:
tfidf_df.to_csv(PROCESSED_DATA_PATH + 'tfidf_df.csv', index=False)