---
# Preprocessing
---

### Load libraries and data

In [13]:
import pandas as pd
from pathlib import Path
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF, HashingTF
from pyspark.sql.functions import split


In [5]:
spark = SparkSession.builder \
    .appName("TextProcessing1") \
    .getOrCreate()

In [6]:
DATA_PATH="../data/raw/aclImdb_v1/aclImdb/train/"
PROCESSED_DATA_PATH="../data/processed/"
RANDOM_SEED=42

In [3]:
columns=["Review", "Label"]
df = pd.DataFrame(columns=columns)

Add positive reviews

In [4]:
p = Path(DATA_PATH ) / 'pos'
for file_path in p.iterdir():
    with file_path.open('r') as file:
        new_row={'Review':file.read(), 'Label':1}
        df.loc[len(df)]=new_row

Add negative reviews

In [5]:
p = Path(DATA_PATH) / 'neg'
for file_path in p.iterdir():
    with file_path.open('r') as file:
        new_row={'Review':file.read(), 'Label':0}
        df.loc[len(df)]=new_row

In [6]:
df.head(5)

Unnamed: 0,Review,Label
0,I came in in the middle of this film so I had ...,1
1,Clint Eastwood reprises his role as Dirty Harr...,1
2,Okay this movie fine like I said but you surel...,1
3,When a small hobbit named Frodo Baggins inheri...,1
4,This film was made in 1943 when i think Judy w...,1


### Preprocessing data

lowercase text

In [7]:
def lower_case(text):
    return text.lower()

df['Review'] = df['Review'].apply(lower_case)

In [8]:
df.head()

Unnamed: 0,Review,Label
0,i came in in the middle of this film so i had ...,1
1,clint eastwood reprises his role as dirty harr...,1
2,okay this movie fine like i said but you surel...,1
3,when a small hobbit named frodo baggins inheri...,1
4,this film was made in 1943 when i think judy w...,1


In [9]:
shuffled_df = df.sample(frac=1).reset_index(drop=True)
shuffled_df.head()

Unnamed: 0,Review,Label
0,i'm not even gonna waste time on this one; it'...,0
1,this movie is practically impossible to descri...,1
2,"when i started watching ""fay grim"", i had no i...",1
3,bo derek will not go down in history as a grea...,0
4,<br /><br />it's a generic coming-of-age story...,1


In [10]:
shuffled_df.to_csv(PROCESSED_DATA_PATH+'df.csv')

Tokenization and lemmatization

In [11]:
nlp=spacy.load('en_core_web_sm')

In [12]:
def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not any(char.isdigit() for char in token.text)]
    return ' '.join(token for token in tokens)

# Apply tokenization to each row in the DataFrame
tokenized_df=pd.DataFrame()
tokenized_df['Label']=df['Label']
tokenized_df['tokens'] = df['Review'].apply(tokenize_text)
#df['tokens'] = df['Review'].apply(tokenize_text)

In [13]:
tokenized_df.head()

Unnamed: 0,Label,tokens
0,1,come middle film idea credit title till look r...
1,1,clint eastwood reprise role dirty harry time c...
2,1,okay movie fine like say surely need watch wor...
3,1,small hobbit name frodo baggin inherit magic r...
4,1,film think judy peak look wise previous film g...


In [15]:
tokenized_df.to_csv(PROCESSED_DATA_PATH+"tekenized_df.csv")

In [7]:
tokenized_df=pd.read_csv(PROCESSED_DATA_PATH+"tekenized_df.csv")

### Vectorization

Bag of Words

In [10]:
spark_df = spark.createDataFrame(tokenized_df)
# Diviser la colonne "tokens" en une liste de mots
spark_df = spark_df.withColumn("tokens_array", split(spark_df["tokens"], " "))

# Initialiser le transformateur CountVectorizer
count_vectorizer = CountVectorizer(inputCol="tokens_array", outputCol="features")

# Appliquer le transformateur CountVectorizer sur les données
model = count_vectorizer.fit(spark_df)
bow = model.transform(spark_df)
Bow_df=result.select("tokens_array","features", "Label").toPandas()

In [12]:
Bow_df.to_csv(PROCESSED_DATA_PATH+"bow_df.csv")

TF-IDF

In [15]:
hashing_tf = HashingTF(inputCol="tokens_array", outputCol="raw_features")
featurized_data = hashing_tf.transform(spark_df)

# Utiliser IDF pour obtenir les vecteurs TF-IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)
TFIDF_df=rescaled_data.select("tokens_array","features", "Label").toPandas()

In [16]:
TFIDF_df.to_csv(PROCESSED_DATA_PATH+"TFIDF_df.csv")