### 1. Importing Libraries

In [1]:
import re
import nltk
import joblib
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# The stop-word filter in the preprocessing step needs the NLTK 'stopwords'
# corpus. Fetch it only when it is not installed yet, so the notebook runs on
# a fresh environment without re-downloading on every run.
# NOTE: the 'punkt' tokenizer models are not fetched because no visible cell
# tokenizes with them; add nltk.download('punkt') here if one is introduced.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Single Porter stemmer instance reused for every tweet.
stemmer = PorterStemmer()

### 2. Importing Dataset

In [2]:
# The file is read with header=None, so the columns arrive numbered 0..3 and
# are renamed to meaningful labels.
df = pd.read_csv('tweets/twitter_validation.csv', header = None)
df = df.rename({0 : 'id', 1 : 'company', 2 : 'sentiment', 3 : 'raw_tweet'}, axis = 1)

# Discard the 'Irrelevant' and 'Neutral' rows in a single pass.
df = df[~df['sentiment'].isin(['Irrelevant', 'Neutral'])]

# A missing tweet is read as NaN (a float), which would make re.sub raise a
# TypeError during preprocessing, so rows without text are dropped here.
df = df.dropna(subset = ['raw_tweet'])

# Raw tweet texts as a NumPy array, in the same row order as df.
tweets = df['raw_tweet'].values

### 3. Text Preprocessing

In [3]:
# Build the stop-word lookup once. The original re-evaluated
# stopwords.words('english') for every word of every tweet and searched the
# resulting list linearly; a set gives the same membership answers in O(1).
english_stopwords = set(stopwords.words('english'))

processed_tweets = []

for tweet in tweets:

    # Replace every non-letter character with a space, then lower-case.
    tweet = re.sub(r'[^a-zA-Z]', ' ', tweet).lower()

    # After the substitution the only separators left are spaces, so split()
    # with no argument yields the same words as split(' ') minus the empty
    # strings. Stop words are removed before stemming, as in the original.
    words = [stemmer.stem(word) for word in tweet.split() if word not in english_stopwords]

    processed_tweets.append(' '.join(words))

### 4. TF-IDF 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the cleaned tweets and encode them in one step;
# enc_tweets is the matrix returned by fit_transform.
tfidf = TfidfVectorizer()

enc_tweets = tfidf.fit_transform(processed_tweets)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Dense frame with one row per tweet and one column per vocabulary term.
df_ = pd.DataFrame(enc_tweets.toarray(), columns = feature_names)
df_

Unnamed: 0,abil,abl,absolut,absurd,access,accessibleatx,accomplish,account,aceofpyrit,achiev,...,ziryhrf,zlcc,zone,zoom,zqw,ztc,ztl,zukf,zy,zyot
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.193864,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5. Saving the Model and Encodings

In [155]:
# Persist the fitted vectorizer so the same vocabulary and IDF weights can be
# reused later.
joblib.dump(tfidf, 'tfidf_model.joblib')

# Save the TF-IDF matrix (df_). The original wrote df, the filtered raw tweets
# frame, which contradicts both the filename and the message printed below.
df_.to_csv('tfidf_enc.csv', index = False)

print('Model is saved with name tfidf_model.joblib!')
print('Encodings are saved with name tfidf_enc.csv')

Model is saved with name tfidf_model.joblib!
Encodings are saved with name tfidf_enc.csv


In [156]:
# Reload the vectorizer from 'tfidf_model.joblib' as a sanity check; being the
# last expression of the cell, its repr is displayed as the cell output.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
joblib.load('tfidf_model.joblib')

TfidfVectorizer()