In [None]:
# Daniel Hu
# Categorizing sentiment in text using Amazon reviews

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from joblib import dump
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
import re

# Load dataframes for training and testing.
# Column names are supplied explicitly, and any row with a missing value in
# any of the three columns is discarded before further processing.
train_df = pd.read_csv(
    "C:/SentimentAnalyzerApp/data/raw/emotion_data_3.csv",
    names=['polarity', 'review_title', 'review_body'],
).dropna()
test_df = pd.read_csv(
    "C:/SentimentAnalyzerApp/data/raw/emotion_data_3_test.csv",
    names=['polarity', 'review_title', 'review_body'],
).dropna()

# Keep sentiment and text, drop the rest.
# DataFrame.drop returns a new frame instead of modifying the original, so
# the result must be assigned back for the column to actually be removed.
train_df = train_df.drop(columns=['review_title'])
test_df = test_df.drop(columns=['review_title'])

# Features are the review bodies; targets are the polarity codes, relabelled
# from the numeric values 1 / 2 to the strings 'negative' / 'positive'.
x_train = train_df['review_body']
y_train = train_df['polarity'].replace({1:'negative', 2:'positive'})

x_test = test_df['review_body']
y_test = test_df['polarity'].replace({1:'negative', 2:'positive'})

# Classification pipeline fed with the raw review text (punctuation and
# capitalization are not stripped from the reviews beforehand).
# Bag-of-words approach over a unigram + bigram vocabulary.
vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
    # An integer min_df is an absolute document count: rarer terms are ignored.
    min_df=1000,
)
tfidf = TfidfTransformer()
# NOTE(review): TfidfTransformer already L2-normalises rows by default, so
# this step looks redundant -- confirm before removing, since the saved
# pipeline's step names may be relied on elsewhere.
normalizer = Normalizer()
# SGD-trained linear classifier using the logistic ('log_loss') loss.
classifier = SGDClassifier(loss='log_loss')

pipe = Pipeline([
    ('vec', vectorizer),
    ('tfid', tfidf),
    ('norm', normalizer),
    ('lr', classifier),
])

# Fit every pipeline stage on the training reviews, then persist the fitted
# pipeline to disk with joblib.
model_path = 'C:/SentimentAnalyzerApp/models/model1.joblib'

model = pipe.fit(x_train, y_train)
dump(model, model_path)

KeyboardInterrupt: 