# Machine learning mini-project 2
### Importing the libraries and loading the dataset

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler



import nltk
import ssl
import string

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop = stopwords.words("english")

In [4]:
df = pd.read_table("Sentiment140.tenPercent.sample.tweets.tsv")
df.head()

Unnamed: 0,sentiment_label,tweet_text
0,4,"@elephantbird Hey dear, Happy Friday to You A..."
1,4,Ughhh layin downnnn Waiting for zeina to co...
2,0,"@greeniebach I reckon he'll play, even if he's..."
3,0,@vaLewee I know! Saw it on the news!
4,0,very sad that http://www.fabchannel.com/ has c...


## Pre-processing the data

In [5]:
# Convert the text to lowercase
df['tweet_text'] = df['tweet_text'].apply(lambda x: x.lower())

In [6]:
# Remove punctuations
def remove_punctuations(text):
    translator = text.maketrans('', '', string.punctuation)
    return text.translate(translator)

df["tweet_text"] = df["tweet_text"].apply(remove_punctuations)

In [7]:
# Remove stopwords
df["tweet_text"] = df["tweet_text"].replace(stop, "")

In [8]:
# Create a new dataframe
df_2 = df.copy()

## Logistic Regression, Machine Learning method 1


In [9]:
# Splitting into test and training data for method 1
X_train, X_test, y_train, y_test = train_test_split(df['tweet_text'], df['sentiment_label'], test_size=0.2, random_state=42)

In [10]:
# Vectorizing the data for method 1
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
# Tokenize the data to turn it into a list of words
df["tweet_text"] = df["tweet_text"].apply(word_tokenize)

In [12]:
#Stemming data for both methods

stemmer = PorterStemmer()
def apply_stemmer(text):
    return [stemmer.stem(word) for word in text]

df['tweet_text'] = df['tweet_text'].apply(apply_stemmer)

In [13]:
lr = LogisticRegression()
lr.fit(X_train_vec, y_train)
y_pred_lr = lr.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy: 0.77725
Confusion Matrix:
 [[12157  3845]
 [ 3283 12715]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.76      0.77     16002
           4       0.77      0.79      0.78     15998

    accuracy                           0.78     32000
   macro avg       0.78      0.78      0.78     32000
weighted avg       0.78      0.78      0.78     32000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Logistic Regression, Mahine learning method 2

In [23]:
# Splitting into test and training data for method 1
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(df_2['tweet_text'], df['sentiment_label'], test_size=0.2, random_state=42)

In [24]:
# Vectorizing the data for method 1
vectorizer_2 = CountVectorizer()
X_train_vec_2 = vectorizer_2.fit_transform(X_train)
X_test_vec_2 = vectorizer_2.transform(X_test)

In [25]:
# Scaling data to work better with the method.
sc = StandardScaler(with_mean=False)
X_train_vec_2 = sc.fit_transform(X_train_vec_2)
X_test_vec_2 = sc.transform(X_test_vec_2)

In [26]:
# Tokenize the data to turn it into a list of words
df_2["tweet_text"] = df_2["tweet_text"].apply(word_tokenize)

In [27]:
#Stemming data for method 2
df_2['tweet_text'] = df_2['tweet_text'].apply(apply_stemmer)

In [28]:
svc = SVC()
svc.fit(X_train_vec_2, y_train_2)
y_pred_svc_2 = svc.predict(X_test_vec_2)
print("Accuracy:", accuracy_score(y_test_2, y_pred_svc_2))
print("Confusion Matrix:\n", confusion_matrix(y_test_2, y_pred_svc_2))
print("Classification Report:\n", classification_report(y_test_2, y_pred_svc_2))