## Tweets - Sentiment Analysis

### Loading and Exploring Data
The dataset was downloaded from [Kaggle](https://www.kaggle.com/kazanova/sentiment140).

In [22]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [23]:
df = pd.read_csv('Sentiment140.csv',encoding='latin-1')
sample = df.sample(n=10000, axis=0)

In [24]:
sample.columns =['target','id','date','flag','user','text']
sample.head()

Unnamed: 0,target,id,date,flag,user,text
1131005,4,1975786807,Sat May 30 15:54:07 PDT 2009,NO_QUERY,veronicalynn,@kimberliea fail I didn't even see him while ...
922138,4,1754352859,Sun May 10 05:58:26 PDT 2009,NO_QUERY,SharonDV,@Journeywoman Thank you M.E.
64766,0,1688928213,Sun May 03 12:14:08 PDT 2009,NO_QUERY,Emmetts,"Ahw, Wipeout-Zacharias' dialect made me want t..."
1577166,4,2189644041,Tue Jun 16 00:32:07 PDT 2009,NO_QUERY,Ellen_Stafford,@OfficialVernonK Don't do it
160252,0,1957021224,Thu May 28 23:17:56 PDT 2009,NO_QUERY,la_oooo_ra,@mysticnz no im not cries LOL


### Preparing data

Processing urls and punctuation marks, cleaning spaces, tokenize, lemmatize and stopwords control

In [27]:
emoji_dict = {
    ":)": "happy",
    ":(": "sad",
    ":D": "laugh",
    ":'(": "cry",
    ":P": "playful",
    ";)": "wink",
    ":-/": "skeptical",
    ":-|": "neutral",
    "<3": "love"
    # Add more mappings as needed
}

def emoji_to_text(s, emoji_dict):
    for emoji, word in emoji_dict.items():
        s = s.replace(emoji, ' ' + word + ' ')
    return s

def pre_processing(s):
    # Convert emojis to text
    s = emoji_to_text(s, emoji_dict)
    
    # Handle negations
    negation_words = ['not', 'no', 'never', 'none', 'nobody', 'nothing', 'neither', 'nowhere', 'hardly', 'scarcely', 'barely', 'doesn’t', 'isn’t', 'wasn’t', 'shouldn’t', 'wouldn’t', 'couldn’t', 'won’t', 'can’t', 'don’t']
    for negation in negation_words:
        s = re.sub(r'\b({})\b[\s]?([a-z]+)'.format(negation), r'\1_\2', s)
    
    # Remove URLs
    s = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', ' ', s)
    
    # Replace various punctuation marks with spaces
    s = s.translate({ord(c): " " for c in "!@#$%^&*()[]{}'\";:,./<>?\|`-~=_+"})
    
    # Remove digits
    s = re.sub('\d+', ' ', s)
    
    # Convert to lowercase
    s = s.lower()
    
    # Tokenize
    tokens = word_tokenize(s)
    
    # Lemmatize
    lm = WordNetLemmatizer()
    lemmatized = [lm.lemmatize(word) for word in tokens]
    
    # Remove stopwords
    l = [word for word in lemmatized if not word in stopwords.words('english')]
    
    return l

# Apply pre-processing to the text
sample["text_processed"] = sample["text"].apply(pre_processing)

### Creating Bag of Words

1. Calculate the frequency distribution of all words
2. Then select the top 5,000 words from the frequency distribution.

In [28]:
bow = [word for sublist in sample["text_processed"] for word in sublist]

### Building Features

Create a 2-dimensional matrix to record whether each of those words is contained in each tweet.

In [29]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_tfidf = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in sample["text_processed"]])
y = sample['target']

In [30]:
X_tfidf

<10000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 67265 stored elements in Compressed Sparse Row format>

In [31]:
#OLD APPROACH

# from nltk.probability import FreqDist
# list_of_words = sample["text_processed"].tolist()
# list_of_words

# bow = []
# for lists in list_of_words:
#     for word in lists:
#         bow.append(word)

# freq_dist = FreqDist(bow)
# top_5000 = freq_dist.most_common(5000)
# word_features, freq = [[x for x,y in top_5000],[y for x,y in top_5000]]
# def find_features(lst, bow):
#     word_features=list(bow)
#     words = set(lst)
#     features = {}
#     for w in word_features:
#         features[w] = (w in words)
#     return features
# featuresets = [(find_features(rev, word_features), category) for (rev, category) in features]
# features=[]
# for i,l in enumerate(sample["text_processed"]):
#     s=[find_features(l,bow),sample["target"].iloc[i]]
#     z=tuple(s)
#     features.append(z)
# len(features)
# train_set, test_set = features[5000:], features[:5000]

### Naive Bayes Model

Build and train a  Bayes classifier

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.5, random_state=42)

In [41]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier2.predict(X_test)

In [43]:
import numpy as np

# Get the feature names from the vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Get the log probability of each feature given a class
feature_log_probs = classifier.feature_log_prob_

# For each class, find the indices of the features with the highest log probabilities
most_informative_features_per_class = np.argsort(feature_log_probs, axis=1)[:, -10:]  # Get top 10 features for each class

# Display the most informative features for each class
for index, class_features in enumerate(most_informative_features_per_class):
    print(f"Class {index}:")
    for feature_index in class_features:
        print(f"  {feature_names[feature_index]}: {np.exp(feature_log_probs[index, feature_index])}")
    print()


Class 0:
  back: 0.0025879912058247674
  today: 0.0026474110657255818
  sad: 0.0026963151574641354
  know: 0.002731645401005582
  go: 0.00276979828664277
  work: 0.002965482338330415
  get: 0.0030073246776280327
  wa: 0.003124331834415638
  miss: 0.003207866887344859
  day: 0.0033570211690197374

Class 1:
  new: 0.00241343360646553
  like: 0.0025496777149691353
  wa: 0.00257810698863464
  lol: 0.00268083096377787
  going: 0.0027181794239409516
  day: 0.002960784211495757
  quot: 0.0032696098595671953
  thanks: 0.0037189461493811254
  love: 0.004400246122089488
  good: 0.004515414053264746



### Testing Naive Bayes Model

*OK = accuracy score is over 0.6. <br>
Good = accuracy score is over 0.7*

In [44]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7068


## Machine Learning Pipeline 

Putting all together 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin

# Custom transformer to apply the pre-processing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, emoji_dict):
        self.emoji_dict = emoji_dict
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return [self.pre_processing(text) for text in X]
    
    def pre_processing(self, s):
        # Convert emojis to text
        s = emoji_to_text(s, self.emoji_dict)
        # ... (rest of your pre-processing code)
        return ' '.join(l)  # Return the processed text as a single string

# Define the emoji dictionary
emoji_dict = {
    ":)": "happy",
    # ... (rest of your emoji mappings)
}

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', TextPreprocessor(emoji_dict)),
    ('vectorizer', TfidfVectorizer(max_features=5000, ngram_range=(1, 3))),
    ('classifier', MultinomialNB())
])

# Load the dataset
df = pd.read_csv('Sentiment140.csv', encoding='latin-1')
sample = df.sample(n=10000, axis=0)
sample.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sample['text'], sample['target'], test_size=0.5, random_state=42)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
