### Libraries

In [2]:
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

### Dataset

In [3]:
# Load the tweet corpus from the CSV file referenced by a relative path.
corpus_path = "corpus.csv"
df = pd.read_csv(corpus_path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,TweetText
0,Now all @Apple has to do is get swype on the i...
1,@Apple will be adding more carrier support to ...
2,Hilarious @youtube video - guy does a duet wit...
3,@RIM you made it too easy for me to switch to ...
4,I just realized that the reason I got into twi...


### Missing Values

In [5]:
# Count the missing entries in every column of the frame.
missing_per_column = df.isna().sum()
missing_per_column

TweetText    0
dtype: int64

### Cleaning data

In [7]:
# function for cleaning
def cleaning(txt):
    """Normalise one tweet before it is vectorised.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, and whitespace-separated tokens that
    appear in ``ENGLISH_STOP_WORDS`` are dropped.

    Parameters
    ----------
    txt : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Punctuation is deleted rather than replaced by a space, so the
    # characters on either side are fused (e.g. "don't" becomes "dont").
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = txt.lower().translate(strip_punctuation)

    # split() with no argument also collapses runs of whitespace.
    kept_tokens = []
    for token in lowered.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run the cleaning function over every tweet and store the result back in the same column.
df['TweetText'] = df['TweetText'].map(cleaning)

# Show the first rows after cleaning.
df.head()

Unnamed: 0,TweetText
0,apple swype iphone crack iphone
1,apple adding carrier support iphone 4s just an...
2,hilarious youtube video guy does duet apple s ...
3,rim easy switch apple iphone ya
4,just realized reason got twitter ios5 thanks a...


### Top 50 Words

In [8]:
# TF-IDF vectorizer with default settings.
tfidf_vect = TfidfVectorizer()

# Learn the vocabulary from the cleaned tweets and build the sparse
# tweet-by-term TF-IDF matrix.
matrix_tfidf = tfidf_vect.fit_transform(df['TweetText'])

# Vocabulary terms, used below as the column labels of the matrix.
featureNames = tfidf_vect.get_feature_names_out()

# Labelled view of the TF-IDF matrix. It is kept sparse-backed on purpose:
# matrix_tfidf.toarray() would allocate a float for every (tweet, term)
# pair, nearly all of them zero, only for the column sums taken below.
df_tfidf = pd.DataFrame.sparse.from_spmatrix(matrix_tfidf, columns=featureNames)

# Total TF-IDF weight of every term across all tweets, summed directly on
# the sparse matrix. np.asarray(...).ravel() flattens the 1 x n_terms result
# into a plain 1-D array so it can be labelled with the terms.
wordScores = pd.Series(
    np.asarray(matrix_tfidf.sum(axis=0)).ravel(),
    index=featureNames,
)

# Rank the terms by their overall TF-IDF weight and keep the 50 heaviest.
top50words = wordScores.sort_values(ascending=False).head(50)

# Print the ranked terms together with their scores.
print(top50words)

twitter             179.514795
google              169.866767
microsoft           145.447217
apple               129.501805
android             107.612097
rt                   87.482604
nexus                59.686031
samsung              56.167924
new                  47.798535
galaxy               46.709653
sandwich             45.805192
en                   44.867273
cream                44.747461
ice                  44.448694
iphone               42.580752
que                  41.898920
el                   36.836136
ics                  35.784983
facebook             35.629049
windows              33.594248
phone                32.689083
40                   30.121356
la                   29.115044
like                 28.376681
siri                 28.374197
just                 28.152020
icecreamsandwich     27.402011
ballmer              26.559059
im                   24.735075
steve                23.292055
galaxynexus          22.646455
app                  22.072598
nexuspri