In [1]:
# including the required libraries
import numpy  as np 
import pandas as pd
import matplotlib.pyplot as plt
import os
cwd = os.getcwd()

In [2]:
# Read the training features and the training labels from the "data" folder
# under the current working directory; neither CSV file has a header row.
dataFrameTrainFeatures, dataFrameTrainLabels = (
    pd.read_csv(os.path.join(cwd, "data", csvName), header=None)
    for csvName in ("question-4-train-features.csv", "question-4-train-labels.csv")
)

# The vocabulary file is tab-separated and is also read without a header row.
dataFrameVocabularies = pd.read_csv(
    os.path.join(cwd, "data", "question-4-vocab.txt"),
    sep="\t",
    header=None,
)

In [3]:
# Place the label column(s) in front of the feature columns, row by row, so
# that the labels come first in every row of the combined matrix.
npTrain = np.concatenate(
    [dataFrameTrainLabels.values, dataFrameTrainFeatures.values],
    axis=1,
)

In [4]:
# Show the stacked matrix between two separator lines.
print(
    '*****************************',
    npTrain,
    '*****************************',
    sep='\n',
)

*****************************
[['neutral' 1 0 ... 0 0 0]
 ['positive' 1 1 ... 0 0 0]
 ['neutral' 1 0 ... 0 0 0]
 ...
 ['negative' 0 0 ... 0 0 0]
 ['negative' 0 0 ... 0 0 0]
 ['positive' 0 0 ... 0 0 0]]
*****************************


In [5]:
# Split the training feature matrix into one matrix per tweet class.
#
# Rows are selected straight from the feature frame with a boolean mask built
# from the label column.  Slicing the stacked label+feature matrix instead
# yields object-dtype arrays (the string label column forces every element to
# be stored as a Python object), which makes the column sums that follow
# needlessly slow, and it required a column slice whose end label lay one past
# the last column.
frameTrain  = pd.DataFrame(npTrain)  # kept so any cell that references it still works
labelColumn = dataFrameTrainLabels.iloc[:, 0]  # first label column, same one the stacked matrix exposed as column 0

npTrainNeutral  = dataFrameTrainFeatures.loc[(labelColumn == 'neutral').values].values
npTrainPositive = dataFrameTrainFeatures.loc[(labelColumn == 'positive').values].values
npTrainNegative = dataFrameTrainFeatures.loc[(labelColumn == 'negative').values].values

# Numbers required for the later calculations:
# wordCount is the number of feature columns, the other three are the number
# of training rows that carry each label.
wordCount = npTrainNeutral.shape[1]
trainNeutralCount  = npTrainNeutral.shape[0]
trainPositiveCount = npTrainPositive.shape[0]
trainNegativeCount = npTrainNegative.shape[0]

In [6]:
# Report the number of word columns and the number of training rows per class.
print('WORD COUNT: ', wordCount, sep='')
print('TRAIN NEUTRAL  COUNT: ', trainNeutralCount, sep='')
print('TRAIN POSITIVE COUNT: ', trainPositiveCount, sep='')
print('TRAIN NEGATIVE COUNT: ', trainNegativeCount, sep='')

WORD COUNT: 5722
TRAIN NEUTRAL  COUNT: 2617
TRAIN POSITIVE COUNT: 2004
TRAIN NEGATIVE COUNT: 7091


In [7]:
# Show each per-class training matrix under its own banner, followed by a
# separator line.
print('-------TRAIN NEUTRAL---------', npTrainNeutral, '*****************************', sep='\n')
print('-------TRAIN POSITIVE--------', npTrainPositive, '*****************************', sep='\n')
print('-------TRAIN NEGATIVE--------', npTrainNegative, '*****************************', sep='\n')

-------TRAIN NEUTRAL---------
[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
*****************************
-------TRAIN POSITIVE--------
[[1 1 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
*****************************
-------TRAIN NEGATIVE--------
[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
*****************************


In [8]:
# Occurrence totals of every word j within each class: summing a class matrix
# over its rows (axis 0) leaves one total per word column.
npNumOfWordJOccNeutral  = np.sum(npTrainNeutral,  axis=0)  # T_j,neutral
npNumOfWordJOccPositive = np.sum(npTrainPositive, axis=0)  # T_j,positive
npNumOfWordJOccNegative = np.sum(npTrainNegative, axis=0)  # T_j,negative

# Wrap each vector of totals as a single-row frame and transpose it, giving a
# one-column frame (column 0) with one row per word.
pdNumOfWordJOccNeutral  = pd.DataFrame([npNumOfWordJOccNeutral]).T
pdNumOfWordJOccPositive = pd.DataFrame([npNumOfWordJOccPositive]).T
pdNumOfWordJOccNegative = pd.DataFrame([npNumOfWordJOccNegative]).T

In [9]:
# For every class keep the 20 rows with the largest occurrence totals
# (column 0); the row labels that remain identify the selected words.
pdMostFrequentlyOccurredWordsNeutral  = pdNumOfWordJOccNeutral.nlargest(n=20, columns=[0])
pdMostFrequentlyOccurredWordsPositive = pdNumOfWordJOccPositive.nlargest(n=20, columns=[0])
pdMostFrequentlyOccurredWordsNegative = pdNumOfWordJOccNegative.nlargest(n=20, columns=[0])

<h1 align="center">INTERPRETATION OF THE TWEETS THAT HAVE THE CLASS NEUTRAL</h1>
<hr>

<h3>Most Frequently Occurring Words in Neutral Tweets</h3>
<ol>
  <li>@jetblue</li>
  <li>@united</li>
  <li>@southwestair</li>
  <li>flight</li>
  <li>@usairways</li>
  <li>@virginamerica</li>
  <li>flights</li>
  <li>help</li>
  <li>fleek</li>
  <li>fleet's</li>
  <li>dm</li>
  <li>time</li>
  <li>tomorrow</li>
  <li>flying</li>
  <li>cancelled</li>
  <li>fly</li>
  <li>change</li>
  <li>today</li>
  <li>travel</li>
  <li>check</li>
</ol>
For the neutral class, the most common words are as expected: several of them indicate that the tweets are about customer satisfaction with airlines in the US. These words are <strong>@jetblue, @southwestair, @virginamerica, flights</strong>. In addition, none of the words clearly reflects whether these tweets are positive or negative. That is why it is logical to interpret these tweets as <strong>neutral.</strong>


In [10]:
# Display the vocabulary rows located at the positions of the top neutral words.
dataFrameVocabularies.iloc[pdMostFrequentlyOccurredWordsNeutral.index.tolist()]

Unnamed: 0,0,1
301,@jetblue,2221
572,@united,3890
300,@southwestair,2455
16,flight,3948
2151,@usairways,2998
0,@virginamerica,517
161,flights,648
76,help,872
4794,fleek,152
4799,fleet's,144


<h1 align="center">INTERPRETATION OF THE TWEETS THAT HAVE THE CLASS POSITIVE</h1>
<hr>

<ol>
  <li>@southwestair</li>
  <li>@jetblue</li>
  <li>@united</li>
  <li>flight</li>
  <li>@usairways</li>
  <li>great</li>
  <li>@virginamerica</li>
  <li>service</li>
  <li>love</li>
  <li>best</li>
  <li>guys</li>
  <li>customer</li>
  <li>time</li>
  <li>awesome</li>
  <li>help</li>
  <li>airline</li>
  <li>amazing</li>
  <li>today</li>
  <li>fly</li>
  <li>flying</li>
</ol>

For the positive class, the most common words are as expected: several of them indicate that the tweets are about customer satisfaction with airlines in the US. These words are <strong>@jetblue, @southwestair, @virginamerica, flight</strong>. In addition, there are also some words that reflect that the tweets are <strong>positive</strong>, such as <strong>great, love, best, awesome, amazing or guys.</strong>

In [11]:
# Display the vocabulary rows located at the positions of the top positive words.
dataFrameVocabularies.iloc[pdMostFrequentlyOccurredWordsPositive.index.tolist()]

Unnamed: 0,0,1
300,@southwestair,2455
301,@jetblue,2221
572,@united,3890
16,flight,3948
2151,@usairways,2998
45,great,336
0,@virginamerica,517
157,service,963
87,love,245
174,best,191


<h1 align="center">INTERPRETATION OF THE TWEETS THAT HAVE THE CLASS NEGATIVE</h1>
<hr>

<ol>
    <li>@united</li>
    <li>flight</li>
    <li>@usairways</li>
    <li>@southwestair</li>
    <li>@jetblue</li>  
    <li>cancelled</li>
    <li>service</li>
    <li>hours</li>
    <li>hold</li>
    <li>time</li>
    <li>customer</li>
    <li>help</li>
    <li>delayed</li>
    <li>plane</li>
    <li>hour</li>
    <li>flights</li>
    <li>bag</li>
    <li>gate</li>
    <li>late</li>
    <li>flightled</li>
</ol>

For the negative class, the most common words are as expected: several of them indicate that the tweets are about customer satisfaction with airlines in the US. These words are <strong>@united, @usairways, @southwestair, @jetblue, flights</strong>. In addition, there are also some words that reflect that the tweets are <strong>negative</strong>, such as <strong>delayed, cancelled, late, hours or hold.</strong>

In [12]:
# Display the vocabulary rows located at the positions of the top negative words.
dataFrameVocabularies.iloc[pdMostFrequentlyOccurredWordsNegative.index.tolist()]

Unnamed: 0,0,1
572,@united,3890
16,flight,3948
2151,@usairways,2998
300,@southwestair,2455
301,@jetblue,2221
309,cancelled,1065
157,service,963
140,hours,684
247,hold,644
21,time,791
