<a href="https://colab.research.google.com/github/AagamanVarma/twitter-sentiment-analysis/blob/main/Twitter_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
# Install the Kaggle API credentials file under ~/.kaggle
# (presumably read by the `kaggle` CLI call in the following cell).

# 1. Create the ~/.kaggle directory (-p: no error if it already exists)
!mkdir -p ~/.kaggle

# 2. Copy kaggle.json from the current working directory into ~/.kaggle/
#    (the file must already be there, e.g. uploaded through the Colab file browser)
!cp kaggle.json ~/.kaggle/

# 3. Restrict the credentials file to owner read/write only (mode 600)
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the kazanova/sentiment140 dataset with the Kaggle CLI.
# The extraction cell below expects the archive at /content/sentiment140.zip.
!kaggle datasets download -d kazanova/sentiment140


In [None]:
# Extract the downloaded archive into the current working directory.
from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

# The handle is named `archive` rather than `zip` so the built-in zip() is not
# shadowed by a closed ZipFile object in every later cell of the kernel.
with ZipFile(dataset, 'r') as archive:
    # extractall() with no arguments writes every member to the current directory.
    archive.extractall()
    print('The dataset is extracted')

In [5]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
# Fetch NLTK's 'stopwords' corpus so stopwords.words() has data to read.
nltk.download('stopwords')
# Show the English stopword list; stemming() filters tokens against this list.
print(stopwords.words('english'))

In [None]:
# Loading the data from csv file to pandas dataframe
col_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=col_names
)

# Checking first few rows
print(twitter_data.head())


In [8]:
# Relabel target value 4 as 1; every other value is left unchanged.
# (The final prediction cell treats 0 as negative and anything else as positive.)
twitter_data['target'] = twitter_data['target'].replace({4: 1})


In [None]:
# Number of missing values in each column
twitter_data.isna().sum()

In [10]:
# Single shared Porter stemmer instance; stemming() calls ps.stem() on each kept token.
ps = PorterStemmer()

In [11]:
# Cache for the stopword set so the NLTK corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def stemming(content):
    """Normalise one piece of text into a space-joined string of stemmed tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lowercase, split on whitespace, drop English stopwords, stem each
    remaining token with the module-level ``ps`` stemmer, and join the
    results with single spaces.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed, stopword-free tokens joined by single spaces; an empty
        string when no tokens remain.
    """
    # Loaded lazily on the first call, then reused; set membership is O(1),
    # whereas re-reading the stopword list for every token is very slow.
    stop_words = _english_stopwords()

    # keep only letters, lowercase, tokenize
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()

    # remove stopwords + stemming, then join back into a string
    return ' '.join(ps.stem(word) for word in tokens if word not in stop_words)

In [12]:
# Run stemming() over every tweet's text and store the result in a new column
twitter_data['stemmed_content'] = twitter_data['text'].map(stemming)


In [None]:
# Preview the first five rows, now including the stemmed_content column
twitter_data.head(n=5)

In [14]:
# Pull out the model inputs (stemmed text) and the labels (target) as arrays
X, y = twitter_data['stemmed_content'].values, twitter_data['target'].values


In [27]:
# Hold out 20% of the samples for testing; stratify on y so both splits keep
# the same label proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Shapes of the full set, the training split and the test split, in that order
print(*(split.shape for split in (X, X_train, X_test)))

In [None]:
# Inspect the training inputs (still raw stemmed text at this point, before vectorizing)
print(X_train)

In [None]:
# Inspect the test inputs (still raw stemmed text at this point, before vectorizing)
print(X_test)

In [31]:
# Convert the stemmed text to TF-IDF features: fit the vectorizer on the
# training text only, then reuse that fitted vectorizer for the test text.
# NOTE(review): X_train / X_test are rebound from text to the vectorizer's
# output, so this cell cannot be re-run without re-running the split cell first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Fit a logistic-regression classifier on the vectorized training data,
# allowing the solver up to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the model on the data it was trained on
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=X_train_prediction,
)
print('Accuracy score on the training data :', training_data_accuracy)

In [None]:
# Score the model on the held-out test split
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=X_test_prediction,
)
print('Accuracy score on the test data :', test_data_accuracy)

In [35]:
import pickle

In [36]:
# Persist the trained classifier to disk (relative path, i.e. the current working directory).
filename = 'trained_model_for_twitter_sentiment_analysis.pkl'

# A context manager guarantees the file is flushed and closed as soon as the
# dump finishes, instead of leaving an open handle for the garbage collector.
# NOTE(review): only `model` is written here; the fitted `vectorizer` is not,
# so this pickle alone cannot turn new raw text into features.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [37]:
# Load the saved model
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(review): the save cell writes to a relative path, while this reads an
# absolute /content path; they refer to the same file only when the working
# directory is /content.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
with open('/content/trained_model_for_twitter_sentiment_analysis.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [None]:
# Take the test sample at position 200 together with its ground-truth label
X_new = X_test[200]
y_true = y_test[200]
print("True label:", y_true)

# Predict with the reloaded model. X_new is passed as-is; no reshape is applied
# (X_test was rebound to the vectorizer's output, so a row slice is presumably
# already the 2-D single-sample input that predict() expects).
prediction = loaded_model.predict(X_new)

print("Predicted label:", prediction[0])

# Label 0 is reported as negative; any other label is reported as positive
print("Negative Tweet" if prediction[0] == 0 else "Positive Tweet")