### Libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re

### Dataset

In [2]:
df = pd.read_csv("full-corpus-training.csv")
df.head()

Unnamed: 0,Sentiment,TweetId,TweetText
0,positive,1.26e+17,Now all @Apple has to do is get swype on the i...
1,positive,1.26e+17,@Apple will be adding more carrier support to ...
2,positive,1.26e+17,Hilarious @youtube video - guy does a duet wit...
3,positive,1.26e+17,@RIM you made it too easy for me to switch to ...
4,positive,1.26e+17,I just realized that the reason I got into twi...


### Missing Values

In [3]:
df.isna().sum()

Sentiment    0
TweetId      0
TweetText    0
dtype: int64

### Preprocess the text to remove any stop words or punctuations

In [7]:
def preprocess_tweet_text(tweet):
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", "", tweet, flags=re.MULTILINE)
    
    # Tokenize the tweet using TweetTokenizer
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Remove punctuations
    tokens = [token for token in tokens if token.isalnum()]
    
    # joinin the cleaned tokens together as a string
    processed_tweet = ' '.join(tokens)
    
    return processed_tweet

# Apply the cleaning function to the TweetText column
df['CleanedTweet'] = df['TweetText'].apply(preprocess_tweet_text)

In [6]:
# Display the cleaned DataFrame
print(df[['Sentiment', 'CleanedTweet']].head(10))

  Sentiment                                       CleanedTweet
0  positive                      get swype iphone crack iphone
1  positive         adding carrier support iphone 4s announced
2  positive  hilarious video guy duet siri pretty much sums...
3  positive                     made easy switch iphone see ya
4  positive           realized reason got twitter ios 5 thanks
5  positive          current user little bit disappointed move
6  positive  16 strangest things siri said far sooo glad ga...
7  positive  great close personal event tonight regent st s...
8  positive   companies experience best customer service aside
9  positive                            apply job hope call lol


### Top 50 Words

In [8]:
# TF IDF vectorizer
tfidf_vect = TfidfVectorizer()

# Fit and transform
matrix_tfidf = tfidf_vect.fit_transform(df['TweetText'])

# using get_feature_names_out 
featureNames = tfidf_vect.get_feature_names_out()

# Data frame for our matrix_tfidf and featureNames
df_tfidf = pd.DataFrame(data=matrix_tfidf.toarray(), columns=featureNames)

# Adding up the importance scores (= TF-IDF scores) for every word.
wordScores = df_tfidf.sum(axis=0)

# Sorting words according to how much they matter in all the tweets
# Sorting them with their overall TF-IDF scores.
top50words = wordScores.sort_values(ascending=False).head(50)

# print top50words
print(top50words)

twitter             179.514795
google              169.866767
microsoft           145.447217
apple               129.501805
android             107.612097
rt                   87.482604
nexus                59.686031
samsung              56.167924
new                  47.798535
galaxy               46.709653
sandwich             45.805192
en                   44.867273
cream                44.747461
ice                  44.448694
iphone               42.580752
que                  41.898920
el                   36.836136
ics                  35.784983
facebook             35.629049
windows              33.594248
phone                32.689083
40                   30.121356
la                   29.115044
like                 28.376681
siri                 28.374197
just                 28.152020
icecreamsandwich     27.402011
ballmer              26.559059
im                   24.735075
steve                23.292055
galaxynexus          22.646455
app                  22.072598
nexuspri