In [3]:
from pymongo import MongoClient
import requests
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
# Open a client against the MongoDB server at localhost:27017.
conn = MongoClient(host="localhost:27017")

In [5]:
# List the names of the databases available through this client,
# so we can pick the one that holds the tweets.
conn.list_database_names()

['HP', 'MyAPI', 'admin', 'config', 'ironhack', 'local']

In [6]:
# Select the "ironhack" database; the bare name on the last line displays its repr.
db = conn["ironhack"]
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'ironhack')

In [7]:
# List the collections stored inside the selected database.
db.list_collection_names()

['companies', 'restaurants', 'Twitter', 'countries_small', 'books']

In [8]:
# Select the "Twitter" collection; the bare name on the last line displays its repr.
collection = db["Twitter"]
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'ironhack'), 'Twitter')

In [9]:
# Peek at a single document to see which fields the collection holds
# before running any real queries against it.
collection.find_one()

{'_id': ObjectId('5fc22ff0dfa555070557b086'),
 'tweet_id': '570306133677760513',
 'airline_sentiment': 'neutral',
 'airline_sentiment_confidence': '1.0',
 'negativereason': '',
 'negativereason_confidence': '',
 'airline': 'Virgin America',
 'airline_sentiment_gold': '',
 'name': 'cairdin',
 'negativereason_gold': '',
 'retweet_count': '0',
 'text': '@VirginAmerica What @dhepburn said.',
 'tweet_coord': '',
 'tweet_created': '2015-02-24 11:35:52 -0800',
 'tweet_location': '',
 'user_timezone': 'Eastern Time (US & Canada)'}

In [27]:
# Fetch the raw material for the sentiment analysis: the first 25 documents,
# projected down to their "text" field only (the "_id" field is excluded).
tw = collection.find(filter={}, projection={"_id": 0, "text": 1}, limit=25)
# Drain the cursor into a plain list of {'text': ...} dicts.
tweets = [doc for doc in tw]
tweets

[{'text': '@VirginAmerica What @dhepburn said.'},
 {'text': "@VirginAmerica plus you've added commercials to the experience... tacky."},
 {'text': "@VirginAmerica I didn't today... Must mean I need to take another trip!"},
 {'text': '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse'},
 {'text': "@VirginAmerica and it's a really big bad thing about it"},
 {'text': "@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA"},
 {'text': '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)'},
 {'text': '@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP'},
 {'text': "@virginamerica Well, I didn't…but NOW I DO! :-D"},
 {'text': "@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."},
 {'text': '@VirginAmerica did you know 

In [28]:
# Pull the tweet text out of every document into a flat list of strings
# (the DataFrame itself is built in the next cell).
tweet_phrases = [doc['text'] for doc in tweets]
tweet_phrases

['@VirginAmerica What @dhepburn said.',
 "@VirginAmerica plus you've added commercials to the experience... tacky.",
 "@VirginAmerica I didn't today... Must mean I need to take another trip!",
 '@VirginAmerica it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces &amp; they have little recourse',
 "@VirginAmerica and it's a really big bad thing about it",
 "@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA",
 '@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)',
 '@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP',
 "@virginamerica Well, I didn't…but NOW I DO! :-D",
 "@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",
 '@VirginAmerica did you know that suicide is the second leading cause of death among teens 10-24',
 '@VirginAmerica I &lt;3 pretty graphic

In [31]:
# Wrap the tweet texts in a one-column DataFrame named "Tweets" and preview the first rows.
tweetsdf = pd.DataFrame({"Tweets": tweet_phrases})
tweetsdf.head(5)

Unnamed: 0,Tweets
0,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials t...
2,@VirginAmerica I didn't today... Must mean I n...
3,@VirginAmerica it's really aggressive to blast...
4,@VirginAmerica and it's a really big bad thing...


In [36]:
# Loading the sentiment analyzer!
# NOTE(review): presumably requires the VADER lexicon resource to be installed
# (e.g. via nltk.download('vader_lexicon')) — confirm on a fresh environment.
sia = SentimentIntensityAnalyzer()

In [60]:
# Example complaint used in the next cell to try out the analyzer.
sentence = "I flew from NYC to SFO last week and couldn't fully sit in my seat due to two large gentleman on either side of me. HELP!"

In [61]:
# Testing the sentiment analyzer on the sample sentence.
# The recorded output is a dict with the keys 'neg', 'neu', 'pos' and 'compound'.
polarity = sia.polarity_scores(sentence)
polarity

{'neg': 0.0, 'neu': 0.866, 'pos': 0.134, 'compound': 0.5754}

In [62]:
def sentimentAnalysis(sentence, analyzer=None):
    """Return the VADER compound polarity score of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        containing a ``'compound'`` key. When omitted, one
        ``SentimentIntensityAnalyzer`` is created on first use and reused by
        every later call.

    Returns
    -------
    The ``'compound'`` entry of the analyzer's polarity scores.
    """
    if analyzer is None:
        # Cache the analyzer on the function object so that applying this
        # function over a whole column does not construct a new analyzer
        # for every single row.
        analyzer = getattr(sentimentAnalysis, "_analyzer", None)
        if analyzer is None:
            analyzer = SentimentIntensityAnalyzer()
            sentimentAnalysis._analyzer = analyzer
    return analyzer.polarity_scores(sentence)['compound']

In [64]:
# Score every tweet with sentimentAnalysis, then attach the scores as a new column.
compound_scores = tweetsdf['Tweets'].apply(sentimentAnalysis)
tweetsdf['sentiment_compound'] = compound_scores
tweetsdf

Unnamed: 0,Tweets,sentiment_compound
0,@VirginAmerica What @dhepburn said.,0.0
1,@VirginAmerica plus you've added commercials t...,0.0
2,@VirginAmerica I didn't today... Must mean I n...,0.0
3,@VirginAmerica it's really aggressive to blast...,-0.5984
4,@VirginAmerica and it's a really big bad thing...,-0.5829
5,@VirginAmerica seriously would pay $30 a fligh...,-0.5945
6,"@VirginAmerica yes, nearly every time I fly VX...",0.6908
7,@VirginAmerica Really missed a prime opportuni...,0.1458
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",-0.3477
9,"@VirginAmerica it was amazing, and arrived an ...",0.7717


### NOW THAT WE KNOW IT WORKS, LET'S GET WHAT WE NEED FOR OUR NEW DATABASE AND START POPULATING IT!

In [33]:
# Load the full tweets CSV; the following cells keep only the columns that are
# useful for the new database and export them as JSON.
data = pd.read_csv(filepath_or_buffer='../data/Tweets.csv')
# Show a handful of random rows to get a feel for the data.
data.sample(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
5457,568977492410703872,negative,1.0,Customer Service Issue,0.3489,Southwest,,Julia_Pabst,,0,@SouthwestAir wanting a ⭐️ for the ✈️ finding ...,,2015-02-20 19:36:19 -0800,Los Angeles,Pacific Time (US & Canada)
8249,568558887290441728,negative,1.0,Bad Flight,0.699,Delta,,superhilarious,,0,@JetBlue :/ he was trying to take stuff from t...,,2015-02-19 15:52:56 -0800,,Central Time (US & Canada)
2717,568940548301549568,negative,0.7055,Late Flight,0.3748,United,,wisemana,,0,@United can you let us out of the gate now. UA...,"[0.0, 0.0]",2015-02-20 17:09:31 -0800,"washington, dc",Eastern Time (US & Canada)
7879,569122341797691392,negative,1.0,Customer Service Issue,0.6869,Delta,,JessBarbalato,,0,@jetblue is the website down? Can't print boar...,,2015-02-21 05:11:54 -0800,,Eastern Time (US & Canada)
5784,568605955614642176,positive,1.0,,,Southwest,,MattChrisEd,,0,"@SouthwestAir appreciate the reply, hopefully ...",,2015-02-19 18:59:58 -0800,City of Angels,Pacific Time (US & Canada)


In [34]:
# Columns that are not needed in the new database.
to_drop = [
    'tweet_id',
    'airline_sentiment',
    'airline_sentiment_confidence',
    'negativereason_confidence',
    'airline_sentiment_gold',
    'negativereason_gold',
    'retweet_count',
    'tweet_coord',
    'tweet_created',
    'user_timezone',
]
# Rebind rather than drop in place, and ignore columns that are already gone,
# so re-running this cell is a no-op instead of raising KeyError.
data = data.drop(columns=to_drop, errors='ignore')
data.head()

Unnamed: 0,negativereason,airline,name,text,tweet_location
0,,Virgin America,cairdin,@VirginAmerica What @dhepburn said.,
1,,Virgin America,jnardino,@VirginAmerica plus you've added commercials t...,
2,,Virgin America,yvonnalynn,@VirginAmerica I didn't today... Must mean I n...,Lets Play
3,Bad Flight,Virgin America,jnardino,@VirginAmerica it's really aggressive to blast...,
4,Can't Tell,Virgin America,jnardino,@VirginAmerica and it's a really big bad thing...,


In [41]:
# Replace missing values with a '-' placeholder.
df = data.fillna('-')
# Draw 150 random rows; the fixed seed makes the sample (and therefore the
# file exported from it) identical on every run of the notebook.
new = df.sample(150, random_state=42)
new

Unnamed: 0,negativereason,airline,name,text,tweet_location
2645,-,United,lisahsamuel,@united Thank you so much for your help with m...,"Bellingham, WA"
12849,-,American,murraysm,@AmericanAir No snow in St. Louis. Cold but no...,"St. Louis, Missouri USA"
12905,Late Flight,American,cubexg,@AmericanAir stuck at gate Miami to JFK flight...,New York
7678,-,Delta,BookerWoodfox,@JetBlue is amazing. Had a short delay. They g...,"Lynn, MA"
4846,Customer Service Issue,Southwest,MsNamri,@SouthwestAir why does the customer service ha...,"Las Vegas, NV"
...,...,...,...,...,...
11500,Cancelled Flight,US Airways,lisascott09,@USAirways is useless airways. Day 2 trying to...,-
13094,Cancelled Flight,American,kayliemj,@AmericanAir On hold since 5am. When no one ca...,"Seattle, WA"
14237,-,American,fritzmt,@AmericanAir message and pics sent...,U.S.A.
10947,Can't Tell,US Airways,altadenadad,@USAirways Plus a US Airways - you need to do ...,"Haverford, PA & Wilton Manors"


In [42]:
# Write the first sample to disk as a JSON array with one object per row.
new.to_json(path_or_buf="../data/tweets_new", orient="records")

In [44]:
# Second batch of 150 rows. Rows already picked for `new` are excluded so the
# two exported files never share a tweet, and the fixed seed keeps the export
# identical on every run of the notebook.
to_fill = df.drop(index=new.index).sample(150, random_state=43)
to_fill.to_json("../data/tweets_to_fill", orient="records")