# Import and Read data

In [7]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os

import re

from pathlib import Path

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

ModuleNotFoundError: No module named 'google.colab'

Read data

In [9]:
main_dir = Path(os.path.abspath('')).parent
print(main_dir)

/Users/antongerasimov/Documents/EPAM/ds/project/final_project_epam_ds


In [11]:
main_dir = Path(os.path.abspath('')).parent
data_path = main_dir / 'data' / 'raw'
train_file_name = "train.csv"
test_file_name = "test.csv"

In [12]:
train_data = pd.read_csv(os.path.join(data_path, train_file_name))

In [13]:
test_data = pd.read_csv(os.path.join(data_path, test_file_name))

# Data preprocessing

## Theory methods



*   Removing URL
*   Removing all irrelevant characters (Numbers and Punctuation)
*   Convert all characters into lowercase
*   Removing Stopwords
*   Stemming and Lemmatization !!!
*   Remove the words having length <= 2 !!!
*   Remove most frequently used words in both positive and negative reviews
*   Remove whitespaces
*   Removal of Rare words
*   Conversion of Emoji to Words ???
*   Conversion of Emoticon to Words ???
*   Removal of HTML Tags
*   Chat Words Conversion ???
*   Spelling Correction ??? 10 rows evaluates for 1.2 minutes








Мб сначала токенизировать, а потом уже работать с текстом







## Methods

In [18]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/antongerasimov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
from nltk.corpus import stopwords

In [20]:
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [21]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/antongerasimov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:
def preprocess_data(column):
    column = remove_url(column)
    column = remove_html_tags(column)
    column = remove_non_alphanumeric(column)
    column = convert_to_lowercase(column)
    column = remove_short_words(column)
    column = remove_stopwords(column)
    return column

In [23]:
def remove_url(review_text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return re.sub(url_pattern, '', review_text)

In [24]:
def remove_html_tags(review_text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', review_text)

In [25]:
def remove_non_alphanumeric(review_text):
    return re.sub(r'[^a-zA-Z]', ' ', review_text)

In [26]:
def convert_to_lowercase(review_text):
    review_text = review_text.lower()
    return review_text

In [27]:
def remove_short_words(review_text, min_length=3):
    return ' '.join(word for word in review_text.split() if len(word) > min_length)

In [28]:
def remove_stopwords(review_text):
    STOPWORDS = set(stopwords.words('english'))
    words = review_text.split()
    filtered_words = [word for word in words if word.lower() not in STOPWORDS]
    return ' '.join(filtered_words)

In [29]:
def correct_spellings(text):
    spell = SpellChecker()
    corrected_text = []
    for word in text.split():
        corrected_word = spell.correction(word)
        if corrected_word:
            corrected_text.append(corrected_word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [30]:
def find_common_words(data):
    positive_review = data[data.sentiment == 'positive']['review']
    negative_review = data[data.sentiment == 'negative']['review']
    splited_review = [positive_review, negative_review]
    all_text_sets = [' '.join(examples) for examples in splited_review]

    top_words_sets = []
  
    for item, text_set in enumerate(all_text_sets):
        top_words_series = pd.Series(text_set.split()).value_counts().head(13)
        top_words = top_words_series.index.tolist()
        top_word_counts = top_words_series.values.tolist()
        top_words_sets.append(set(top_words))

    return set.intersection(*top_words_sets)

In [31]:
def remove_common_words_from_review(review_text, common_words):
    words = review_text.split()
    filtered_words = [word for word in words if word.lower() not in common_words]  # Consider case-insensitive matching
    return ' '.join(filtered_words)

Скорее всего нужно удалять одни и те же слова, и брать их из трейн даты

In [84]:
def remove_common_words(current_data, common_words):
    current_data['review'] = current_data['review'].apply(lambda text: remove_common_words_from_review(text, common_words))
    return current_data

# Lematization vs Stematization

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Data preparation

In [None]:
data_for_test = train_data.copy()
data_for_test['review'] = data_for_test['review'].apply(preprocess_data)

stemmer

In [None]:
stemmer = PorterStemmer()
def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

data_for_test['stemmed_review'] = data_for_test['review'].apply(lambda text: stem_words(text))

Unnamed: 0,review,sentiment,stemmed_review
0,caught little totally accident back revival th...,positive,caught littl total accid back reviv theatr sil...
1,believe movie accomplish favor friends early a...,negative,believ movi accomplish favor friend earli apri...
2,spoiler alert gets nerve people remake term lo...,negative,spoiler alert get nerv peopl remak term loos g...
3,thing learnt watching george romero creepshow ...,negative,thing learnt watch georg romero creepshow stum...
4,remember theaters reviews said horrible well t...,negative,rememb theater review said horribl well think ...


In [None]:
data_for_test['stemmed_review'].head()

0    caught littl total accid back reviv theatr sil...
1    believ movi accomplish favor friend earli apri...
2    spoiler alert get nerv peopl remak term loos g...
3    thing learnt watch georg romero creepshow stum...
4    rememb theater review said horribl well think ...
Name: stemmed_review, dtype: object

lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [None]:
data_for_test['lemmed_review'] = data_for_test['review'].apply(lambda text: lemmatize_words(text))
data_for_test['lemmed_review'].head()

0    caught little totally accident back revival th...
1    believe movie accomplish favor friend early ap...
2    spoiler alert get nerve people remake term loo...
3    thing learnt watching george romero creepshow ...
4    remember theater review said horrible well thi...
Name: lemmed_review, dtype: object