# Sentiment Analysis
What is this project about?

# 1. Import Libraries
Why are we using these libraries?

In [50]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

# 2. Data Import and Overview

In [51]:
# Load the raw tweets; the path is relative to the notebook's working directory.
data = pd.read_csv('Tweets.csv')

In [None]:
data.head()

In [None]:
data.info()

In [None]:
#checking unique values 
data.nunique()

In [None]:
#checking null values in our data
data.isnull().sum()

# 2. Pre-Processing Data

In [7]:
# Parse 'tweet_created' and keep only the calendar date.
# NOTE: `.dt.date` yields Python `datetime.date` objects, so the column ends up
# with dtype `object` rather than `datetime64`.
data['tweet_created'] = pd.to_datetime(data['tweet_created']).dt.date

In [None]:
data.info()

In [None]:
#checking uniques values in tweet_created columns
data['tweet_created'].nunique()

In [10]:
# Number of tweets per distinct 'tweet_created' value (index: date, value: row count).
numberoftweets = data.groupby('tweet_created').size()

In [None]:
numberoftweets.sort_values(ascending=False)

## 2.2 Dealing with Null Values

In [None]:
data.isna().sum()

[SUGGESTION]: **airline_sentiment_gold** and **negativereason_gold** have more than 99% missing data, and **tweet_coord** has nearly 93% missing data. It is better to drop these columns, as they will not provide any meaningful information.

In [None]:
data.columns

In [14]:
# Drop the columns that are almost entirely empty.
# `errors='ignore'` keeps this cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed.
data = data.drop(
    columns=['airline_sentiment_gold', 'negativereason_gold', 'tweet_coord'],
    errors='ignore',
)

In [None]:
data.head()

## 2.3 What is the biggest reason for having negative comments?

In [16]:
# Number of tweets per stated negative reason.
# groupby drops missing keys by default, so tweets without a reason are not counted.
neg_reason = data.groupby('negativereason').size()

In [None]:
neg_reason.sort_values(ascending=False)

# 3. EDA

## 3.1 Type of Sentiment

In [None]:
# Count tweets per sentiment and put the counts in a fixed, explicit order.
# `value_counts()` sorts by frequency, so without `reindex` the tick labels
# below would only line up with the bars by coincidence. `fill_value=0` keeps
# three bars even if one sentiment is absent.
sentiment_order = ['negative', 'neutral', 'positive']
counter = data.airline_sentiment.value_counts().reindex(sentiment_order, fill_value=0)
index = [1,2,3]
# Same colour per sentiment as the per-airline charts (negative=red, neutral=green, positive=blue).
plt.bar(index, counter, color=['red', 'green', 'blue'])
plt.xticks(index, sentiment_order, rotation=60)
plt.xlabel('Sentiment Type')
plt.ylabel('Sentiment Count')
plt.title('Count of Type of Sentiment')

## 3.2 Airline sentiments for each airline

In [None]:
# checking the different airlines we have
data['airline'].unique()

In [None]:
# One bar chart of sentiment counts per airline, laid out on a 2x3 grid.
airlines= ['US Airways','United','American','Southwest','Delta','Virgin America']
sentiment_order = ['negative', 'neutral', 'positive']
plt.figure(figsize=(12, 12))

# enumerate() gives the subplot position directly instead of searching the list
# with airlines.index(i) on every iteration.
for indices, i in enumerate(airlines):
    plt.subplot(2,3,indices+1)
    new_df=data[data['airline']==i]
    # value_counts() is ordered by frequency, which can differ between airlines;
    # reindex so each bar matches its tick label, and fill absent classes with 0
    # so there are always exactly three bars.
    count=new_df['airline_sentiment'].value_counts().reindex(sentiment_order, fill_value=0)
    Index = [1,2,3]
    plt.bar(Index,count, color=['red', 'green', 'blue'])
    plt.xticks(Index,sentiment_order)
    plt.ylabel('Mood Count')
    plt.xlabel('Mood')
    plt.title('Count of Moods of '+i)

## 3.3 Airlines by Negative sentiment

In [21]:
# Keep only the tweets labelled as negative; later cells plot from this subset.
negative_data = data.loc[data['airline_sentiment'] == 'negative']

In [None]:
negative_data.airline.value_counts()

In [None]:
# Bar chart of negative-tweet counts per airline, labelled through the Axes
# object returned by pandas rather than the pyplot state machine.
ax = negative_data.airline.value_counts().plot.bar()
ax.set_xlabel('Airlines')
ax.set_ylabel('Count of negative tweets')

## 3.4 Airline by all sentiment

In [None]:
# Tweet counts for every (airline, sentiment) pair, as a Series with a two-level index.
airline_all_sentiment = data.groupby(['airline', 'airline_sentiment']).size()
# unstack() moves the sentiment level into columns, so each airline gets one stacked bar.
airline_all_sentiment.unstack().plot(kind='bar', stacked=True, figsize=(15,10))

## 3.5 Is there a relationship between negative sentiments and date?

In [None]:
negative_data.info()

In [None]:
# Count of negative tweets per 'tweet_created' value, split by airline (one bar per pair).
sns.countplot(data=negative_data, x='tweet_created', hue='airline')
# Rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [27]:
#!pip install wordcloud

In [28]:
from wordcloud import WordCloud,STOPWORDS

## 3.6 Word cloud of Positive sentiment

In [None]:
# Word cloud of positive tweets: join all tweet texts, drop links, @mentions
# and the bare 'RT' marker, then render what is left.
new_df = data.loc[data['airline_sentiment'] == 'positive']
words = ' '.join(new_df['text'])

kept_words = []
for word in words.split():
    # Skip anything containing a URL fragment, any @handle and the retweet marker.
    if 'http' in word or word.startswith('@') or word == 'RT':
        continue
    kept_words.append(word)
cleaned_word = " ".join(kept_words)

cloud_options = dict(stopwords=STOPWORDS,
                     background_color='black',
                     width=3000,
                     height=2500)
wordcloud = WordCloud(**cloud_options).generate(cleaned_word)

plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

## 3.7 Word cloud of Negative sentiment

In [None]:
# Word cloud of negative tweets: same cleaning as for the positive cloud
# (no links, no @mentions, no bare 'RT'), rendered on a black background.
is_negative = data['airline_sentiment'] == 'negative'
new_df = data[is_negative]
words = ' '.join(new_df['text'])
cleaned_word = " ".join(
    word
    for word in words.split()
    if not ('http' in word or word.startswith('@') or word == 'RT')
)

wordcloud = WordCloud(
    stopwords=STOPWORDS, background_color='black', width=3000, height=2500
)
wordcloud = wordcloud.generate(cleaned_word)

plt.figure(1, figsize=(12, 12))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# 4. Natural Language Processing

### Text Representation
- In Natural Language Processing (NLP) the conversion of raw-text to numerical form is called <b>Text Representation</b>
- This step is most important in the NLP pipeline because if we feed inappropriate data, our prediction will be useless.

If you have a good `Text Representation`, even an ordinary algorithm will give much better results than high-end APIs and algorithms fed with a poor `Text Representation`.


### Basic Text Pre-Processing

**1. Stop-Word Removal** : In English, words like a, an, the, as, in, on, etc. are considered stop-words. Depending on our requirements, we can remove them to reduce the vocabulary size, since these words carry little specific meaning.

**2. Lower Casing** : Convert all words into the lower case because the upper or lower case may not make a difference for the problem.
And we are reducing vocabulary size by doing so. 

**3. Stemming** : Stemming refers to the process of removing suffixes and reducing a word to some base form such that all different variants of that word can be represented by the same form (e.g., “walk” and “walking” are both reduced to “walk”).

**4. Tokenization** : NLP software typically analyzes text by breaking it up into words (tokens) and sentences.

In [31]:
df = data.copy()

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def clean_text(d):
    """Return `d` with every character that is not an ASCII letter or whitespace removed.

    Digits, punctuation, symbols and non-ASCII letters are deleted without a
    replacement, so the words around them are not merged or split.
    """
    # Collect the characters to keep instead of substituting away the rest.
    kept_chars = re.findall(r'[a-zA-Z\s]', d)
    return ''.join(kept_chars)

# Extra stop words: airline names/handles plus generic travel words.
names = ['delta', 'deltaair', 'united', 'unitedair', 'southwest', 'southwestair', 'usairways',
         'virginamerica', 'american', 'americanair', 'jetblue', 'jetblues', 'usairway',
         'flight', 'airline', 'airlines']

# Words related to the airline names are not relevant to the sentiment analysis,
# so `clean_stopword` treats every entry of `names` as an additional stop word.
# Append further words to `names` for custom cleaning.

def clean_stopword(d):
    """Lower-case `d` and drop stop words, the custom `names` and one-character tokens.

    Parameters
    ----------
    d : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined by single spaces.
    """
    # The function is applied once per tweet. Fetching the NLTK list and scanning
    # it linearly on every call is wasted work, so the English stop words are
    # stored as a frozenset on the function object the first time it runs.
    english = getattr(clean_stopword, '_english_stop_words', None)
    if english is None:
        english = frozenset(stopwords.words('english'))
        clean_stopword._english_stop_words = english

    # `names` is re-read on every call so later edits to the list still take effect.
    extra = set(names)

    kept = []
    for word in d.split():
        lowered = word.lower()
        if len(word) > 1 and lowered not in english and lowered not in extra:
            kept.append(lowered)
    return " ".join(kept)

def tokenize(d):
    """Split the string `d` into tokens with NLTK's `word_tokenize` and return them."""
    tokens = word_tokenize(d)
    return tokens

In [None]:
# Cleaning pipeline, one named stage per step:
# strip non-letters -> remove stop words -> tokenize.
letters_only = df['text'].apply(clean_text)
without_stopwords = letters_only.apply(clean_stopword)
df['final_text'] = without_stopwords.apply(tokenize)
df['final_text'].head()

In [None]:
# Show a few cleaned tweets (rows 0, 1, 100 and 1000), each followed by a blank line.
for row_label in (0, 1, 100, 1000):
    print(" ".join(df.final_text[row_label]), '\n')

# 5. Text Analysis

## 5.1 Lexicon based approach

The lexicon-based approach involves calculating the orientation of a document from the semantic orientation of the words or phrases in it. It uses dictionaries of words annotated with each word's semantic orientation, or polarity.

Here we will simply check how well the lexicon classifies the sentiment that has already been allocated by the real customers.

### 5.1.1 Vader

Vader is a lexicon made for the sentiment analysis of text data on social media.

It returns the compounded polarity score of a sentence after calculating the negative, the neutral, and the positive scores of each word in a sentence.

We can use Vader from the built-in module in nltk as well as through the Python package [vaderSentiment](https://pypi.org/project/vaderSentiment/).

VADER has many advantages over traditional sentiment analysis methods: [source](https://ichi.pro/ko/python-eseo-vaderleul-sayonghayeo-gamjeong-bunseog-dansunhwa-sosyeol-midieo-tegseuteu-274770204542255)

* **It works very well with social media type text**, but generalizes easily to multiple domains.
* It does not require any training data, but is built from a generalizable, valence-based, human-curated gold-standard sentiment lexicon.
* It's fast enough to use online with streaming data,
* The speed-performance trade-off isn't serious.



In [None]:
import nltk
# Fetch the VADER lexicon that the analyzer below needs.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# One analyzer instance, reused by the following cells.
vader = SentimentIntensityAnalyzer()
# Sanity check: score the first cleaned tweet (tokens re-joined into one string).
vader.polarity_scores(" ".join(df.final_text[0]))

In [None]:
# Re-join each token list into a single string for VADER.
# Iterating over the column directly avoids one label lookup per row and does
# not assume that the index runs 0..n-1.
texts = [" ".join(tokens) for tokens in df.final_text]

# Compare VADER's scores with the labelled sentiment for a few sample tweets:
# raw tweet, cleaned tweet, then the scores next to the actual label.
for i in (0, 1, 10, 100):
    print(df.text.iloc[i])
    print(texts[i])
    print(vader.polarity_scores(texts[i]), f'--> Actual Classification: {df.airline_sentiment.iloc[i]}', '\n')


## 5.2 Machine Learning approach

The ML approach is also known as Document Classification. It uses ML/DL algorithms to classify the text data.

The most important step in the ML approach is converting the text to vectors or another numeric format, so that the algorithms can use the text as features.


* Vectorization : `CountVectorizer`, `TfidfTransformer`, `Word2Vec`
* Model: Logistic Regression, RandomForest, SupportVectorMachine

In [37]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
df.final_text

In [None]:
# Collapse each token list back into one space-separated string.
# - A single whole-column assignment replaces the chained `df[col][i] = ...`
#   writes, which trigger SettingWithCopyWarning and are not guaranteed to
#   modify `df` at all.
# - Values that are already strings are left untouched, so re-running the cell
#   does not turn "good" into "g o o d".
df['final_text'] = df['final_text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)

In [None]:
df['final_text'][:3]

In [41]:
def convert_Sentiment(sentiment):
    """Map a sentiment label to its integer class code.

    "negative" -> 0, "neutral" -> 1, "positive" -> 2.
    Any other value matches nothing and yields None.
    """
    # The position of each label in this tuple is its class code.
    labels = ("negative", "neutral", "positive")
    for code, label in enumerate(labels):
        if sentiment == label:
            return code
    return None
    
# Encode the sentiment labels as integers (negative=0, neutral=1, positive=2).
# Guarded so the cell can be re-run: applying the conversion to an already
# encoded (numeric) column would map every value to None.
if not pd.api.types.is_numeric_dtype(df['airline_sentiment']):
    df['airline_sentiment'] = df['airline_sentiment'].apply(convert_Sentiment)

In [42]:
# Features: the 'final_text' column; target: the 'airline_sentiment' column.
X = df.final_text
y = df.airline_sentiment

In [None]:
X.head()

In [None]:
y.head()

### 5.2.1 CountVectorizer & TfidfVectorizer

In [45]:
# model_params = {'random_state':42}
# model_list = [LogisticRegression(**model_params, solver='liblinear'),
#               RandomForestClassifier(**model_params),
#               SVC(**model_params)]
# model_names = ['LogisticRegression', 'RandomForest', 'SupportVectorMachine']

# c = CountVectorizer()
# X_c = c.fit_transform(X)

# tf = TfidfVectorizer()
# X_tf = tf.fit_transform(X)

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for model, model_name in zip(model_list, model_names):
#     for n_fold, (trn_idx, vld_idx) in enumerate(skf.split(df.index, df.airline_sentiment)):        
#         X_c_trn = X_c[trn_idx]
#         X_tf_trn = X_tf[trn_idx]
#         y_trn = df.loc[trn_idx, 'airline_sentiment']
        
#         X_c_vld = X_c[vld_idx]
#         X_tf_vld = X_tf[vld_idx]
#         y_vld = df.loc[vld_idx, 'airline_sentiment']        
        
#         model.fit(X_c_trn, y_trn)
#         c_pred_col = f"{model_name}_ct_pred"
#         df.loc[vld_idx, c_pred_col] = model.predict(X_c_vld)
        
#         model.fit(X_tf_trn, y_trn)
#         tf_pred_col = f"{model_name}_tf_pred"
#         df.loc[vld_idx, tf_pred_col] = model.predict(X_tf_vld)

#     print(f"Model: {model_name}, CountVectorizer, Accuracy: {accuracy_score(df.airline_sentiment, df[c_pred_col]):.3%}")
#     print(f"Model: {model_name}, TfidfVectorizer, Accuracy: {accuracy_score(df.airline_sentiment, df[tf_pred_col]):.3%}\n")

### 5.2.2 Word2Vec 

Word2Vec is one of the most popular models for representing each word in a large text corpus as a vector in n-dimensional space.

There are two kinds of W2V, Continuous Bag-of-Words(CBOW) and Skip-Gram.

Skip-gram is used to predict the context word for a given target word. It’s reverse of CBOW algorithm. Here, target word is input while context words are output.

In most cases, skip-gram is known to give better predictive performance than CBOW.

We can use the `Word2Vec` class from `gensim` and set the option `sg=1`, where `sg` is the abbreviation of skip-gram.

In [46]:
from gensim.models import Word2Vec

# Tokenize every entry of 'final_text' into the list-of-token-lists form
# that is passed to Word2Vec as `sentences`.
texts_w2v = [tokenize(text) for text in df.final_text]

# Skip-gram model (sg=1); words seen fewer than 5 times are ignored.
# NOTE(review): training with workers > 1 is presumably not bit-for-bit
# reproducible between runs -- confirm against the gensim docs if that matters.
w2v_options = dict(window=3, vector_size=100, min_count=5, workers=4, sg=1)
w2v = Word2Vec(sentences=texts_w2v, **w2v_options)

In [None]:
texts_w2v[:3]

<br>

`Word2Vec` imported from `gensim` trains on the given texts as soon as the model object is created. 

We can find the similar words with the given word and the examples are represented below.

<br>

In [None]:
w2v.wv.most_similar('thank')

In [None]:
w2v.wv.most_similar('customerservice')

# 6. Conclusions - Suggestions