In [8]:
import pandas as pd
import re
from dataVisualisation import viewData, viewDistrib
from bs4 import BeautifulSoup
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import spacy
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec as wv
from nltk.stem import PorterStemmer

In [9]:
df = pd.read_csv("archive/IMDB Dataset.csv")

In [10]:
df.review[0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [11]:
gensim.utils.simple_preprocess("One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice")

['one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching',
 'just',
 'oz',
 'episode',
 'you',
 'll',
 'be',
 'hooked',
 'they',
 'are',
 'right',
 'as',
 'this',
 'is',
 'exactly',
 'what',
 'happened',
 'with',
 'me',
 'br',
 'br',
 'the',
 'first',
 'thing',
 'that',
 'struck',
 'me',
 'about',
 'oz',
 'was',
 'its',
 'brutality',
 'and',
 'unflinching',
 'scenes',
 'of',
 'violence',
 'which',
 'set',
 'in',
 'right',
 'from',
 'the',
 'word',
 'go',
 'trust',
 'me',
 'this',
 'is',
 'not',
 'show',
 'for',
 'the',
 'faint',
 'hearted',
 'or',
 'timid',
 'this',
 'show',
 'pulls',
 'no',
 'punches',
 'with',
 'regards',
 'to',
 'drugs',
 'sex',
 'or',
 'violence',
 'its',
 'is',
 'hardcore',
 'in',
 'the',
 'classic',
 'use',
 'of',
 'the',
 'word',
 'br',
 'br',
 'it',
 'is',
 'called',
 'oz',
 'as',
 'that',
 'is',
 'the',
 'nickname',
 'given',
 'to',
 'the',
 'oswald',
 'maximum',
 'security',
 'state',
 'penitentary',
 'it',
 'focuses'

#### First attempt to clean the data set.

This version does not include the lemmatization and stemming.

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))

def fastDataCleaner(text):
    '''
        Function : clean the dataBase
            1. Remove HTML tags
            2. Tokenizer + cleaning with gensim
            3. Remove stopwords
    '''
    
    # 1. Supprimer balises HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # 2. Tokenizer + nettoyage avec gensim
    tokens = simple_preprocess(text)  # deacc=True supprime ponctuation

    # 3. Supprimer stopwords
    tokens = [tok for tok in tokens if tok not in stop_words]

    return tokens

# Enregistre le df dans un fichier csv
def saveDF(df, number):
    df.to_csv(f"clean_IMDB{number}.csv", index=False)
    df.to_pickle(f"clean_IMDB{number}.pkl")

def saveModel(model,name):
    model.save(f"{name}.model")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\djibr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import numpy as np
print(spacy.__version__)
print(np.__version__)
print(gensim.__version__)

3.8.2
1.26.4
4.3.2


import numpy as np
print(spacy.__version__)
print(np.__version__)
print(gensim.__version__)
3.8.2
1.26.4
4.3.2

#### Training used to clean our DataBase

In [None]:
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    '''
        Function : clean the dataBase
            1. Remove HTML, mentions, hashtags, special characters
            2. Tokenizer (gensim)
            3. Remove stopwords
            4. Lemmatisation (spacy)
            5. Stemming
    '''
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"@\w+|#\w+|[^a-zA-Z ]", " ", text)

    tokens = simple_preprocess(text, deacc=True)

    tokens = [tok for tok in tokens if tok not in stop_words]

    doc = nlp(" ".join(tokens))
    lemmas = [token.lemma_ for token in doc if token.is_alpha]

    stems = [stemmer.stem(lemma) for lemma in lemmas]

    return stems

In [15]:
cdf = pd.DataFrame({
    "clean_review": df["review"].apply(preprocess),
    "sentiment": df["sentiment"]
})

In [None]:
def saveDF(df, number):
    '''
        Function : Save the DataFrame into a CSV file or a PKL file.
    '''
    df.to_csv(f"clean_IMDB{number}.csv", index=False)
    df.to_pickle(f"clean_IMDB{number}.pkl")

def saveModel(model,name):
    '''
        Function : Save the Word2Vec model into a .model file. 
    '''
    model.save(f"{name}.model")

In [20]:
saveDF(cdf, 3)

In [21]:
clean_reviews = cdf.clean_review

print(clean_reviews == cdf.clean_review)

0        True
1        True
2        True
3        True
4        True
         ... 
49995    True
49996    True
49997    True
49998    True
49999    True
Name: clean_review, Length: 50000, dtype: bool


In [18]:
type(clean_reviews)

pandas.core.series.Series

In [22]:
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2,
    workers=4
)   

In [23]:
model.build_vocab(clean_reviews, progress_per=1000)

In [24]:
model.epochs

5

In [25]:
model.corpus_count

50000

In [26]:
model.train(clean_reviews, total_examples=model.corpus_count, epochs=model.epochs)

(27250435, 29469920)

In [27]:
saveModel(model, "modelWV4")

In [28]:
model.wv.most_similar("bad")

[('worst', 0.7958782911300659),
 ('terribl', 0.7851450443267822),
 ('aw', 0.7703185081481934),
 ('horribl', 0.7569575905799866),
 ('lousi', 0.7216058373451233),
 ('wors', 0.7084195017814636),
 ('crappi', 0.6901121139526367),
 ('suck', 0.6686270833015442),
 ('atroci', 0.6559823751449585),
 ('lame', 0.6535003781318665)]

In [30]:
model2 = wv.load("modelW2V/testModelV2.model")

In [31]:
model2.wv.most_similar("bad")

[('awful', 0.7683809399604797),
 ('terrible', 0.7541758418083191),
 ('horrible', 0.7350753545761108),
 ('good', 0.727000892162323),
 ('lame', 0.682565450668335),
 ('stupid', 0.6769680976867676),
 ('lousy', 0.6632498502731323),
 ('crappy', 0.6457701921463013),
 ('cheesy', 0.6375147104263306),
 ('atrocious', 0.6278131008148193)]