## Data Analysis Question: Which US airline is the least favoured among passengers?

Data Source: kaggle.com
    
Data Analyst: Alex Idachaba

In [59]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import date

In [60]:
# Load the raw tweets export and preview the first three rows.
# NOTE(review): the .info() output reports tweet_id as float64, so the ids
# may already have lost precision in the CSV itself — confirm against the source file.
sentiments = pd.read_csv('Tweets.csv')
sentiments.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,5.70306e+17,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,24/02/2015 11:35,,Eastern Time (US & Canada)
1,5.70301e+17,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,24/02/2015 11:15,,Pacific Time (US & Canada)
2,5.70301e+17,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,24/02/2015 11:15,Lets Play,Central Time (US & Canada)


In [61]:
# Summarise each column's dtype and non-null count, plus total memory usage
sentiments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
tweet_id                        14640 non-null float64
airline_sentiment               14640 non-null object
airline_sentiment_confidence    14640 non-null float64
negativereason                  9178 non-null object
negativereason_confidence       10522 non-null float64
airline                         14640 non-null object
airline_sentiment_gold          40 non-null object
name                            14640 non-null object
negativereason_gold             32 non-null object
retweet_count                   14640 non-null int64
text                            14640 non-null object
tweet_coord                     1019 non-null object
tweet_created                   14640 non-null object
tweet_location                  9907 non-null object
user_timezone                   9820 non-null object
dtypes: float64(3), int64(1), object(11)
memory usage: 1.7+ MB


In [62]:
# True if at least one row repeats an earlier row across all columns
sentiments.duplicated().any()

True

In [63]:
# Keep the first occurrence of every row and discard exact repeats
sentiments = sentiments.drop_duplicates()

In [64]:
# List the current column names before renaming
sentiments.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [65]:
# Give the two columns used later clearer, snake_case names
column_renames = {
    'negativereason': 'negative_reason',
    'tweet_created': 'date_created',
}
sentiments = sentiments.rename(columns=column_renames)

In [66]:
# Convert date_created from text to datetime.
# The raw values are day-first (e.g. '24/02/2015 11:35'), so dayfirst=True is
# required: without it, ambiguous dates such as '01/02/2015' are read month-first.
# yearfirst=True was dropped because the year is the last field, not the first.
sentiments['date_created'] = pd.to_datetime(sentiments['date_created'], dayfirst=True)

In [67]:
# Keep only tweets created on 19 Feb 2015, using a half-open window:
# from 19 Feb 00:00 (inclusive) up to 20 Feb 00:00 (exclusive)
start_date = pd.Timestamp('2015-02-19')
end_date = pd.Timestamp('2015-02-20')
created = sentiments['date_created']
on_or_after_start = created >= start_date
before_end = created < end_date
sent_day = sentiments[on_or_after_start & before_end]
sent_day.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negative_reason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,date_created,tweet_location,user_timezone
320,5.68663e+17,negative,1.0,Customer Service Issue,0.6596,Virgin America,,davidhfe,,0,@virginAmerica Other carriers are less than ha...,,2015-02-19 22:47:00,"San Francisco, CA",Pacific Time (US & Canada)
321,5.68662e+17,negative,1.0,Flight Booking Problems,0.6841,Virgin America,,davidhfe,,0,@VirginAmerica WTF is happening in PDX Late Fl...,,2015-02-19 22:44:00,"San Francisco, CA",Pacific Time (US & Canada)
322,5.68646e+17,neutral,0.6667,,,Virgin America,,chet,,0,@VirginAmerica add DTW and I'm sold!,,2015-02-19 21:39:00,,Pacific Time (US & Canada)


In [68]:
# Keep only tweets labelled positive or negative
polar_labels = ['positive','negative']
mask = sent_day['airline_sentiment'].isin(polar_labels)
sent_day = sent_day.loc[mask]

In [69]:
# Count tweets for every (airline, sentiment) pair
by_airline_sentiment = sent_day.groupby(['airline', 'airline_sentiment'])
grp = by_airline_sentiment['airline_sentiment'].count()
grp

airline         airline_sentiment
American        positive               1
Delta           negative             135
                positive              78
Southwest       negative             127
                positive              96
US Airways      negative             193
                positive              32
United          negative             272
                positive              69
Virgin America  negative              24
                positive              20
Name: airline_sentiment, dtype: int64

### United Airlines has the most negative tweets on 19 Feb 2015: 272.

In [70]:
# Rank the (airline, negative reason) pairs by how often they occur
reason_counts = sent_day.groupby(['airline', 'negative_reason'])['negative_reason'].count()
grp = reason_counts.sort_values(ascending=False)
grp

airline         negative_reason            
United          Customer Service Issue         84
                Late Flight                    71
US Airways      Late Flight                    54
                Customer Service Issue         50
United          Can't Tell                     38
Delta           Customer Service Issue         31
Southwest       Late Flight                    30
                Customer Service Issue         26
                Can't Tell                     26
Delta           Late Flight                    25
US Airways      Can't Tell                     23
Delta           Can't Tell                     20
                Cancelled Flight               19
US Airways      Bad Flight                     19
                Flight Booking Problems        19
Southwest       Bad Flight                     18
United          Lost Luggage                   16
                Cancelled Flight               14
                Bad Flight                     14
      

### United Airlines accounts for the two most frequent negative reasons:

### 1. Customer Service Issue : 84

### 2.  Late Flight : 71

In [71]:
# Helpers used to build separate positive / negative sentiment columns

def positive(value):
    """Return 'positive' when value equals 'positive'; otherwise return None."""
    return 'positive' if value == 'positive' else None
    

def negative(value):
    """Return 'negative' when value equals 'negative'; otherwise return None."""
    return 'negative' if value == 'negative' else None

In [73]:
# sent_day was produced by boolean-slicing another frame, so adding columns to
# it directly is chained assignment and raises SettingWithCopyWarning.
# Take an explicit copy first so the new columns land on an independent frame.
sent_day = sent_day.copy()
sentiment_labels = sent_day['airline_sentiment']
# Each new column repeats the label where it matches and is empty otherwise
sent_day['positive_sentiment'] = sentiment_labels.apply(positive)
sent_day['negative_sentiment'] = sentiment_labels.apply(negative)

In [74]:
# Export the filtered frame (with the two added sentiment columns) to CSV,
# UTF-8 encoded and without the DataFrame index
sent_day.to_csv('Tweets_modified.csv', encoding='utf-8', index=False)