In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes  import BernoulliNB
import nltk# FOR TEXT PROCESSING
from wordcloud import WordCloud,STOPWORDS
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer#TEXT PROCESSING
import missingno as mns# CHECKING FOR MISSING
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
# Fetch the NLTK stop-word corpus and keep the English word list; it is consulted
# by the tweet-cleaning function when filtering tokens.
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Simeon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the tweets from Tweets.csv (path is relative to the working directory)
# and preview the first rows.
df_tweet = pd.read_csv('Tweets.csv')
df_tweet.head()

In [None]:
# missingno bar chart over the columns of df_tweet, sorted in descending order;
# used here to check for missing values.
mns.bar(df_tweet,color='black',sort='descending')

In [None]:
# List the distinct labels present in the airline_sentiment column.
df_tweet['airline_sentiment'].unique()

In [None]:
def plot_sentiment_wordcloud(tweets, sentiment):
    """Draw a word cloud of the tweet text for one ``airline_sentiment`` label.

    Args:
        tweets: DataFrame with ``airline_sentiment`` and ``text`` columns.
        sentiment: Label to select, e.g. ``'negative'``.
    """
    texts = tweets.loc[tweets['airline_sentiment'] == sentiment, 'text']
    # Drop links, @mentions and the retweet marker so they do not dominate the cloud.
    kept_words = [
        word
        for word in ' '.join(texts).split()
        if 'http' not in word and not word.startswith('@') and word != 'RT'
    ]
    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color='gray',
        width=2000,
        height=1500,
    ).generate(' '.join(kept_words))

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(cloud)
    # The title is what tells the three otherwise similar figures apart.
    ax.set_title(sentiment + ' tweets')
    ax.axis('off')
    plt.show()


# One cloud per sentiment label.
for sentiment_label in ('negative', 'positive', 'neutral'):
    plot_sentiment_wordcloud(df_tweet, sentiment_label)

In [None]:
# List the distinct airline names present in the airline column.
df_tweet['airline'].unique()

In [None]:
import warnings

# NOTE: this silences every warning for the rest of the session, not only in this cell.
warnings.filterwarnings('ignore')

airlines = ['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways','American']

# Use one category order for every panel so the bars can be compared across airlines.
sentiment_order = sorted(df_tweet['airline_sentiment'].dropna().unique())

# One panel per airline on a shared 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 15))
for ax, airline in zip(axes.flat, airlines):
    airline_tweets = df_tweet[df_tweet['airline'] == airline]
    print(airline_tweets['airline_sentiment'].value_counts(), airline)
    # Pass the column as x=: newer seaborn releases do not treat a bare positional
    # argument as the x variable.
    sns.countplot(x=airline_tweets['airline_sentiment'], order=sentiment_order, ax=ax)
    ax.set_title('count of mood of ' + airline)
    

In [None]:
airlines = ['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways','American']

# Create the figure once, outside the loop, with one row per airline. Opening a new
# figure on every iteration would leave each airline alone in a single slot of an
# otherwise empty 6-row grid.
fig, axes = plt.subplots(len(airlines), 1, figsize=(40, 80), squeeze=False)
for ax, airline in zip(axes.ravel(), airlines):
    airline_tweets = df_tweet[df_tweet['airline'] == airline]
    print(airline_tweets['negativereason'].value_counts(), airline)
    # Pass the column as x=: newer seaborn releases do not treat a bare positional
    # argument as the x variable.
    sns.countplot(x=airline_tweets['negativereason'], ax=ax)
    ax.set_title('count of negative reasons for ' + airline)
    

In [None]:
# Matches @mentions, URLs, and any run of characters that are not ASCII letters or digits.
# Raw string so that \S reaches the regex engine without an invalid-escape warning.
text_cleaning_re = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"


def preprocess(x, stem=False, stopwords_to_drop=None):
    """Lower-case a tweet, strip noise, and drop stop words.

    Args:
        x: Raw text; non-string values are converted with ``str()``.
        stem: If True, reduce each kept token with an English Snowball stemmer.
        stopwords_to_drop: Optional collection of words to remove. Defaults to
            the module-level ``stop_words`` list.

    Returns:
        The kept tokens joined by single spaces ('' when nothing is left).
    """
    if stopwords_to_drop is None:
        stopwords_to_drop = stop_words
    cleaned = re.sub(text_cleaning_re, ' ', str(x).lower()).strip()
    # split() with no argument tokenises on whitespace. Newlines have already been
    # replaced by spaces above, so splitting on '\n' would yield the whole string
    # as one "token" and no stop word would ever be removed.
    tokens = [token for token in cleaned.split() if token not in stopwords_to_drop]
    if stem:
        # stem() is an instance method: the stemmer has to be built for a language first.
        stemmer = SnowballStemmer('english')
        tokens = [stemmer.stem(token) for token in tokens]
    # Return only after every token has been examined.
    return ' '.join(tokens)
# Clean every tweet; this overwrites the text column of df_tweet with the cleaned text.
df_tweet['text'] = df_tweet['text'].apply(preprocess)

In [None]:
# Collapse the labels to two classes: 'negative' -> 'NEGATIVE', and every other
# value (including 'neutral') -> 'POSITIVE'.
is_negative = df_tweet['airline_sentiment'] == 'negative'
df_tweet['sentiment'] = is_negative.map({True: 'NEGATIVE', False: 'POSITIVE'})

In [None]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split repeatable.
y = df_tweet['sentiment']
x_train, x_test, y_train, y_test = train_test_split(
    df_tweet['text'],
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Both pipelines share the same bag-of-words -> TF-IDF front end and differ only in
# the final classifier.
# NOTE(review): BernoulliNB appears to binarize its input by default, in which case
# the TF-IDF weights only act as present/absent flags for textclf1 - confirm.
textclf1 = Pipeline([
    ('vector', CountVectorizer()),
    ('transformer', TfidfTransformer()),
    ('clf', BernoulliNB()),
])
textclf2 = Pipeline([
    ('vector', CountVectorizer()),
    ('transformer', TfidfTransformer()),
    # Seeded (same seed as the train/test split) so the randomly initialised
    # network gives the same result on every run.
    ('clf', MLPClassifier(hidden_layer_sizes=(10, 10), random_state=1)),
])

In [None]:
# Train the BernoulliNB pipeline on the training split.
textclf1.fit(x_train,y_train)

In [None]:
# Predict sentiment labels for the held-out tweets with the fitted pipeline.
pred = textclf1.predict(x_test)

In [None]:
# Text summary of the classification metrics, comparing predictions with the true test labels.
print(classification_report(y_test,pred))

In [None]:
# Fix the label order explicitly so the tick labels are guaranteed to line up with
# the rows (actual) and columns (predicted) of the matrix.
labels = sorted(set(y_test) | set(pred))
cm = confusion_matrix(y_test, pred, labels=labels)
# Cells hold integer counts, hence fmt='d'.
ax = sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion matrix');