# Twitter Sentiment Analysis using Machine Learning

In [4]:
import re
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
import nltk
# Fetch the NLTK stop-word corpus; stemming() reads it later through
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Data processing

### Loading the data from CSV file to a Dataframe

In [6]:
# No header/names argument is passed, so pandas uses the first line of the CSV
# as column labels. The head() output further down shows a tweet record sitting
# in the header row; the file is re-read with explicit column names later on.
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

### Checking the number of rows and columns

In [7]:
# (rows, columns) of the frame as loaded above.
twitter_data.shape

(1599999, 6)

In [8]:
# Preview the first rows to inspect the column labels and sample values.
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


### Naming the columns and reading the dataset again

In [None]:
# Explicit labels for the six CSV fields; passing them via `names` means the
# first line of the file is read as data instead of being used as the header.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    '/content/training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

In [11]:
# Only the last expression of a cell is echoed, so a bare `twitter_data.shape`
# on the first line would be evaluated and silently discarded. Print it so the
# row/column count appears together with the preview.
print(twitter_data.shape)
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


### Counting the number of missing values in the dataset

In [12]:
# Per-column count of missing (null) entries.
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


### Checking the distribution of target column

In [13]:
# Number of rows for each distinct value of the 'target' label column.
twitter_data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


### Convert the target "4" to "1"

In [14]:
# Remap the label value 4 to 1 in the 'target' column only; other columns
# and other label values are left as they are.
twitter_data['target'] = twitter_data['target'].replace(4, 1)

0 --> Negative Tweet

1 --> Positive Tweet

### Stemming (process of reducing a word to its root word)

In [17]:
# One shared Porter stemmer instance; stemming() calls port_stem.stem() on each token.
port_stem = PorterStemmer()

In [18]:
@lru_cache(maxsize=None)
def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, built on first use."""
  return frozenset(stopwords.words('english'))


def stemming(content):
  """Normalise one tweet into a space-separated string of word stems.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case the text, split on whitespace, drop English stop words,
  Porter-stem the remaining tokens and join them with single spaces.

  Args:
    content: Raw tweet text (str).

  Returns:
    str: The stemmed tokens joined by single spaces; an empty string when
    no token survives the filtering.
  """
  # The character class must end in 'A-Z', not 'A-z': the latter range also
  # covers the punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would
  # then be kept in the text instead of being blanked out.
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  tokens = letters_only.lower().split()

  # Look the stop words up in a cached set. Calling stopwords.words('english')
  # inside the comprehension re-evaluates it for every single token and then
  # scans a list for membership.
  stop_words = _english_stop_words()
  stems = [port_stem.stem(word) for word in tokens if word not in stop_words]

  return ' '.join(stems)

In [None]:
# Run stemming() on every value of 'text' and store the result in a new
# 'stemmed_content' column; the original 'text' column is not modified.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [None]:
# Show the stemmed text column produced by the previous cell.
print(twitter_data['stemmed_content'])

### Separating the data and label

In [None]:
# X: the stemmed tweet strings (model input).
X = twitter_data['stemmed_content'].values
# Y: the sentiment labels from the 'target' column.
Y = twitter_data['target'].values

### Splitting the data to training data and test data

In [None]:
# Hold out 20% of the rows as the test set. The labels are passed as the
# stratification key, and random_state is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

### Converting the textual data into numerical data

In [None]:
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer to transform the test text.
# NOTE: both names are overwritten with the vectorized data, so the raw text
# arrays are gone after this cell; re-running it without re-running the split
# cell would feed already-vectorized data back into the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## Training the Machine Learning Model

### Logistic regression

In [None]:
# Logistic-regression classifier with the iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

In [None]:
# Train the classifier on the vectorized training tweets and their labels.
model.fit(X_train, Y_train)

### Model evaluation

In [None]:
# Accuracy score on the training data.
# Despite its name, X_train_prediction holds the predicted labels for X_train;
# they are compared against the true training labels Y_train.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the training-set accuracy computed in the previous cell.
print('Accuracy score on the training data :', training_data_accuracy)

In [None]:
# Accuracy score on the test data.
# X_test_prediction holds the predicted labels for the held-out X_test rows;
# they are compared against the true test labels Y_test.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the test-set accuracy computed in the previous cell.
print('Accuracy score on the test data :', test_data_accuracy)

### Saving the trained model

In [None]:
import pickle

filename = 'trained_model.sav'

# Write through a context manager so the file handle is closed (and its
# buffer flushed to disk) as soon as the dump finishes, instead of leaving an
# anonymous open() handle for the garbage collector.
# NOTE: only `model` is serialized here; the fitted `vectorizer` is not saved
# by this cell, so this file alone cannot turn new raw text into features.
with open(filename, 'wb') as model_file:
  pickle.dump(model, model_file)

### Using the saved model for future predictions

In [None]:
# Load from the same relative path the model was saved to ('trained_model.sav'),
# rather than an absolute '/content/...' path that only matches when the
# working directory happens to be /content. The context manager closes the
# file handle once loading is done.
# SECURITY: pickle.load can execute arbitrary code; only load files you created.
with open('trained_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [None]:
# Pick one already-vectorized test row (index 3) as the sample to classify.
X_new = X_test[3]

prediction = loaded_model.predict(X_new)
print(prediction)

# Label 0 is reported as negative; any other predicted label as positive.
print('Negative Tweet' if prediction[0] == 0 else 'Positive Tweet')