In [1]:
import itertools
import pandas as pd
import numpy as np
import string
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, confusion_matrix

In [2]:
df = pd.read_csv(r'D:\Dataset\twitter-airline-sentiment\Tweets.csv', 'Tweets', delimiter=',')
df = df.fillna('NA')
df = df[['airline_sentiment','negativereason', 'text']]
print(df.columns)
print(df.head())

Index(['airline_sentiment', 'negativereason', 'text'], dtype='object')
  airline_sentiment negativereason  \
0           neutral             NA   
1          positive             NA   
2           neutral             NA   
3          negative     Bad Flight   
4          negative     Can't Tell   

                                                text  
0                @VirginAmerica What @dhepburn said.  
1  @VirginAmerica plus you've added commercials t...  
2  @VirginAmerica I didn't today... Must mean I n...  
3  @VirginAmerica it's really aggressive to blast...  
4  @VirginAmerica and it's a really big bad thing...  


In [3]:
def cleanText(text):
    text = " ".join(filter(lambda x:x[0]!='@', text.split()))
    
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    no_punct = ""
    for char in text:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct

df['text'] = df['text'].map(lambda x: cleanText(x))

In [4]:
df['text'].head()

0                                            What said
1    plus youve added commercials to the experience...
2    I didnt today Must mean I need to take another...
3    its really aggressive to blast obnoxious enter...
4              and its a really big bad thing about it
Name: text, dtype: object

In [5]:
sentiment_LE = preprocessing.LabelEncoder()
df['airline_sentiment'] = sentiment_LE.fit_transform(df['airline_sentiment'])

negativereason_LE = preprocessing.LabelEncoder()
df['negativereason'] = negativereason_LE.fit_transform(df['negativereason'])

In [6]:
vectorizer = TfidfVectorizer(strip_accents='unicode', max_features=25000,  analyzer='word', ngram_range=(1,5), norm='l2')
vectorizer.fit(df['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=25000, min_df=1,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
X = df['text']
y = df[['airline_sentiment', 'negativereason']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [8]:
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [9]:
classifier = ClassifierChain(OneVsRestClassifier(LinearSVC(class_weight="balanced", max_iter=2000)))
classifier.fit(X_train_vec, y_train)

ClassifierChain(classifier=OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=2000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1),
        order=None, require_dense=[True, True])

In [10]:
y_pred = classifier.predict(X_test_vec)
predicted_df = pd.DataFrame(y_pred.toarray(), columns = ['airline_sentiment', 'negativereason'])

In [11]:
print("Accuracy of airline_sentiment - "+str(accuracy_score(y_test['airline_sentiment'], predicted_df['airline_sentiment'])))
print("Accuracy of negativereason - "+str(accuracy_score(y_test['negativereason'], predicted_df['negativereason'])))

Accuracy of airline_sentiment - 0.7950819672131147
Accuracy of negativereason - 0.6154371584699454
