In [None]:
import string
import pickle
import pathlib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Characters that count as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)

# Lazily-filled cache for the NLTK resources shared by every preprocessing() call.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESSING_CACHE:
        # Build both objects before storing either, so a failed lookup
        # leaves the cache empty and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESSING_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['lemmatizer']


def preprocessing(text):
    """Tokenize ``text`` and return its cleaned, lemmatized tokens.

    Steps, in order:
      1. Split ``text`` into tokens with ``word_tokenize``.
      2. Drop every token that consists only of ``string.punctuation``
         characters (this also covers multi-character tokens such as
         "``" or "...", which a substring test against
         ``string.punctuation`` lets through).
      3. Drop English stop words. The comparison is done on the lowercased
         token so capitalised stop words ("The", "This") are removed too;
         the surviving tokens keep their original casing.
      4. Lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw text to process (a ``str``).

    Returns:
        A list of token strings.
    """
    stop_words, lemmatizer = _preprocessing_resources()

    cleaned = []
    for token in word_tokenize(text):
        # all() is True for an empty token as well, so empty strings are dropped.
        if all(char in _PUNCTUATION_CHARS for char in token):
            continue
        if token.lower() in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(token))

    return cleaned

In [None]:
# Project root: the parent of the current working directory.
BASE_DIR = pathlib.Path('.').resolve(strict=True).parent

# Input dataset.
CSV_DIR = BASE_DIR.joinpath('csv')
CSV_FILE = CSV_DIR.joinpath('dataset.csv')

# Output locations for the pickled artifacts.
MODEL_DIR = BASE_DIR.joinpath('backend', 'model', 'predict')
VECTOR_DIR = BASE_DIR.joinpath('backend', 'model', 'vectorizer')

MODEL_FILE = MODEL_DIR.joinpath('model.pkl')
VECTOR_FILE = VECTOR_DIR.joinpath('vectorizer.pkl')

# Make sure both output directories exist before anything is written to them.
for artifact_dir in (MODEL_DIR, VECTOR_DIR):
    artifact_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the dataset; later cells read its 'review' and 'sentiment' columns.
df = pd.read_csv(filepath_or_buffer=CSV_FILE)
df.head(5)

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# Already-encoded values map to themselves, so re-running this cell is safe.
label_codes = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
encoded_sentiment = df['sentiment'].map(label_codes)

# Fail here, naming the offending values, rather than at a later integer cast.
unknown_labels = df.loc[encoded_sentiment.isna(), 'sentiment'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected sentiment labels: {unknown_labels.tolist()}")

# Replace the whole column at once instead of writing integers into the
# string-typed column element by element.
df['sentiment'] = encoded_sentiment.astype(int)

# Show a preview rather than printing the entire frame.
df.head()

In [None]:
# Run preprocessing() over every review and store the resulting token lists
# in a new 'processed' column next to the raw text.
reviews = df['review']
df['processed'] = processed_reviews = reviews.apply(preprocessing)
df.head()

In [None]:
# Features are the processed token lists; targets are the sentiment labels cast to int.
x, y = df['processed'], df['sentiment'].astype(int)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Join each token list back into one space-separated string.
# NOTE: this rebinds x_train / x_test, so re-running the cell on its own
# operates on the already-joined strings.
x_train = list(map(' '.join, x_train))
x_test = list(map(' '.join, x_test))

In [None]:
# Fit the vectorizer on the training texts only, then apply the same fitted
# vectorizer to the test texts.
# NOTE: x_train / x_test are rebound to the vectorized output here.
vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(raw_documents=x_train)
x_test = vectorizer.transform(raw_documents=x_test)

In [None]:
# Logistic-regression classifier; max_iter is set very high for the lbfgs solver.
lr_params = {'solver': 'lbfgs', 'max_iter': 100000}
lr = LogisticRegression(**lr_params)
lr.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test features.
y_pred = lr.predict(X=x_test)

In [None]:
# Fraction of test predictions that match the true labels, and its complement.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
error_fraction = 1 - accuracy

print("Accuracy of model in percentage:", accuracy * 100)
print("Loss in percentage:", error_fraction * 100)

In [None]:
# Persist the fitted classifier to MODEL_FILE.
with MODEL_FILE.open('wb') as model_handle:
    pickle.dump(lr, model_handle)

In [None]:
# Persist the fitted vectorizer to VECTOR_FILE.
with VECTOR_FILE.open('wb') as vectorizer_handle:
    pickle.dump(vectorizer, vectorizer_handle)