### Libraries

In [1]:
import pandas as pd
import re, itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import preprocessor as p
from nltk.stem import WordNetLemmatizer
import little_mallet_wrapper
from nltk.tokenize import TweetTokenizer 
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.classify import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
pd.options.display.max_colwidth = 100

### Dataset

In [2]:
df = pd.read_csv("full-corpus-training.csv")

# Filter out records with the "irrelevant" label
# df = df[df['Sentiment'] != 'irrelevant']

df.head()

Unnamed: 0,Sentiment,TweetId,TweetText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the iphone and it will be crack. Iphone that is
1,positive,1.26e+17,@Apple will be adding more carrier support to the iPhone 4S (just announced)
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet with @apple 's Siri. Pretty much sums up the love aff...
3,positive,1.26e+17,@RIM you made it too easy for me to switch to @Apple iPhone. See ya!
4,positive,1.26e+17,I just realized that the reason I got into twitter was ios5 thanks @apple


### Missing Values

In [3]:
df.isna().sum()

Sentiment    0
TweetId      0
TweetText    0
dtype: int64

### Preprocess and cleaning the text to remove any stop words or punctuations

In [4]:
def clean_text_data(text):
    # preprocessor
    text = p.clean(text)

    # Remove HTML tags and URLs
    text = re.sub(r'<[^>]+>|http[s]?://\S+|http\S+|www\S+|https\S+', '', text)
    
    # Remove punctuation and replace words with multiple consecutive letters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    
    # Insert a space before all capital letters in the middle of a sentence
    text = re.sub(r"(\w)([A-Z])", r"\1 \2", text)

    # Tokenize the tweet using TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    word_tokens = tokenizer.tokenize(text)

    # Stop word removal and length filtering
    stop_words = set(stopwords.words('english'))
    stop_words.remove('not')
    filtered_text = [word for word in word_tokens if word.isalnum() and len(word) > 3 and word.lower() not in stop_words]

    # Lowercase change
    text = ' '.join(filtered_text).lower()

    # Lemmatization using WordNet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    text = ' '.join(lemmatized_words)

    return text

# Apply the cleaning function to the 'TweetText' column
df['CleanedTweet'] = df['TweetText'].apply(lambda x: little_mallet_wrapper.process_string(x, numbers='remove'))
df['CleanedTweet'] = df['CleanedTweet'].apply(clean_text_data)

# Display the cleaned DataFrame
print(df['CleanedTweet'].head(10))


0                                                          apple swype iphone crack iphone
1                                            apple adding carrier support iphone announced
2         hilarious youtube video duet apple siri pretty much sum love affair http exbnqjy
3                                                            made easy switch apple iphone
4                                                     realized reason twitter thanks apple
5                    current blackberry user little disappointed move android apple iphone
6    strangest thing siri said glad apple gave siri sense humor http cotwaeudbp happyplace
7                                    great close personal event apple tonight regent store
8                              company experience best customer service aside zappos apple
9                                                                    apply apple hope call
Name: CleanedTweet, dtype: object


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF IDF vectorizer with adjusted parameters
tfidf_vect = TfidfVectorizer()

# Fit and transform
matrix_tfidf = tfidf_vect.fit_transform(df['CleanedTweet'])

# Using get_feature_names_out
featureNames = tfidf_vect.get_feature_names_out()

# Data frame for our matrix_tfidf and featureNames
df_tfidf = pd.DataFrame(data=matrix_tfidf.toarray(), columns=featureNames)

# Adding up the importance scores (= TF-IDF scores) for every word.
wordScores = df_tfidf.sum(axis=0)

# Sorting words according to how much they matter in all the tweets
# Sorting them with their overall TF-IDF scores.
top20words = wordScores.sort_values(ascending=False).head(20)

# Print top20words
print(top20words)


twitter             154.058752
google              132.880067
http                132.453139
apple               118.209687
microsoft           112.318714
android              90.867818
iphone               51.194151
nexus                48.710277
samsung              45.992413
sandwich             43.026021
cream                42.603218
phone                40.175494
galaxy               36.254216
window               31.573166
like                 29.438681
siri                 28.974785
facebook             28.959853
ballmer              24.940511
icecreamsandwich     24.068868
steve                22.565562
dtype: float64


In [6]:
# Visualization of top words
# plt.figure(figsize=(12, 6))
# top20words.plot(kind='bar')
# plt.title('Top 50 Words and Their TF-IDF Scores')
# plt.xlabel('Words')
# plt.ylabel('TF-IDF Score')
# plt.show()



### Train Test Split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df['CleanedTweet'], df['Sentiment'], test_size=0.2, random_state=42)

### Vectorize the data

In [10]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

### Using Naive Bayes MultinomialNB to train our data

In [11]:
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

### Prediction and classification report

In [12]:
y_pred = classifier.predict(X_test_vectorized)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

  irrelevant       0.96      0.64      0.77       482
    negative       1.00      0.01      0.03       138
     neutral       0.63      0.99      0.77       680
    positive       1.00      0.00      0.00        95

    accuracy                           0.71      1395
   macro avg       0.90      0.41      0.39      1395
weighted avg       0.81      0.71      0.64      1395



### Using other models to train our data

In [None]:
# Text Classification-Methods
# Naïve Bayes
# Voted Perceptron
# Support Vector Machines
# Decision Trees
# K-nearest neighbor
# Rocchio’s algorithm
# Neural Networks

# Predictive Performance Evaluation 
# Classification accuracy on a dataset
# Recall
# Precision
# F-measure
# Accuracy
# Error and ...
