# Load datasets to mLab

* https://www.kaggle.com/c/twitter-sentiment-analysis2/discussion
* https://www.kaggle.com/kazanova/sentiment140

In [1]:
import pymongo
import pandas as pd
import numpy as np
import random

In [2]:
def db_connection(collection_name): 
    # connect to mLab DB
    try:
        with open("../credentials/mlab_credentials.txt", 'r', encoding='utf-8') as f:
            [name,password,url,dbname]=f.read().splitlines()
            db_conn = pymongo.MongoClient("mongodb://{}:{}@{}/{}".format(name,password,url,dbname))
            print ("DB connected successfully!!!")
    except pymongo.errors.ConnectionFailure as e:
        print ("Could not connect to DB: %s" % e) 
        
    db = db_conn[dbname]
    collection = db[collection_name]
    
    return collection


def kaggle_id(id):
    return "kaggle_train_" + str(id)


def sentiment140_id(id):
    return "sentiment140_" + str(id)


def random_sentiment(sentiment):
    sentiments_list = [-1,0,1]
    return random.choice(sentiments_list)


def random_location(dummy):
    location = random.choice(locations_list)
    #location = locations_list[0]
    lat = random_location_lat(location)
    lon = random_location_lon(location)
    return [lat,lon]

def random_location_lat(location):
    #location = random.choice(locations_list)
    #location = locations_list[0]
    lat = location["lat_min"] + random.random()*(location["lat_max"] - location["lat_min"])
    return lat

def random_location_lon(location):
    #location = random.choice(locations_list)
    #location = locations_list[0]
    lon = location["lon_min"] + random.random()*(location["lon_max"] - location["lon_min"])
    return lon

In [3]:
db_collection_locations = db_connection("twitter_happiness_locations")
locations_list = [location for location in db_collection_locations.find()]

DB connected successfully!!!


In [4]:
#locations_list

In [5]:
db_collection = db_connection("twitter_happiness_test")

DB connected successfully!!!


In [8]:
result = db_collection.delete_many({})
print(result.deleted_count, " documents deleted")

0  documents deleted


In [9]:
data_kaggle = pd.read_csv(
    "source/kaggle/train.csv",
    encoding='latin-1',
    header=0,
    names=["id_src","sentiment","text"]
)


data_kaggle["id"] = data_kaggle["id_src"].apply(kaggle_id)
data_kaggle["sentiment"] = data_kaggle["sentiment"].apply(random_sentiment)
#data_kaggle["lat"] = data_kaggle["sentiment"].apply(random_location_lat)
#data_kaggle["lon"] = data_kaggle["sentiment"].apply(random_location_lon)
data_kaggle["loc"] = data_kaggle.apply(random_location, axis=1)
data_kaggle["lat"] = data_kaggle["loc"].apply(lambda loc: loc[0])
data_kaggle["lon"] = data_kaggle["loc"].apply(lambda loc: loc[1])


print(data_kaggle.shape)
data_kaggle.head()

(99989, 7)


Unnamed: 0,id_src,sentiment,text,id,loc,lat,lon
0,1,-1,is so sad for my APL frie...,kaggle_train_1,"[29.63702342857413, -95.35537782160773]",29.637023,-95.355378
1,2,1,I missed the New Moon trail...,kaggle_train_2,"[41.483389054936005, 2.2228405047869244]",41.483389,2.222841
2,3,0,omg its already 7:30 :O,kaggle_train_3,"[40.84332881706086, -73.49881006560452]",40.843329,-73.49881
3,4,1,.. Omgaga. Im sooo im gunna CRy. I'...,kaggle_train_4,"[45.604520247677726, -73.77354443218005]",45.60452,-73.773544
4,5,1,i think mi bf is cheating on me!!! ...,kaggle_train_5,"[45.5341333112686, -75.9626843512826]",45.534133,-75.962684


In [10]:
db_collection.insert_many(data_kaggle.to_dict('records'))

<pymongo.results.InsertManyResult at 0x1107f4088>

In [11]:
print(db_collection.count())

99989


  """Entry point for launching an IPython kernel.


In [12]:
data_sentiment140 = pd.read_csv(
    "source/sentiment140/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=["sentiment","id_src","date","flag","user","text"]
)

data_sentiment140["id"] = data_sentiment140["id_src"].apply(kaggle_id)
data_sentiment140["sentiment"] = data_sentiment140["sentiment"].apply(random_sentiment)
#data_sentiment140["lat"] = data_sentiment140["sentiment"].apply(random_location_lat)
#data_sentiment140["lon"] = data_sentiment140["sentiment"].apply(random_location_lon)
data_sentiment140["loc"] = data_sentiment140.apply(random_location, axis=1)
data_sentiment140["lat"] = data_sentiment140["loc"].apply(lambda loc: loc[0])
data_sentiment140["lon"] = data_sentiment140["loc"].apply(lambda loc: loc[1])

data_sentiment140 = data_sentiment140.head(100000).copy()
print(data_sentiment140.shape)
data_sentiment140.head()

(100000, 10)


Unnamed: 0,sentiment,id_src,date,flag,user,text,id,loc,lat,lon
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",kaggle_train_1467810369,"[41.38687246926971, 2.215406213422864]",41.386872,2.215406
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,kaggle_train_1467810672,"[40.88887706951048, -74.29616650174177]",40.888877,-74.296167
2,-1,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kaggle_train_1467810917,"[38.84174937085038, -77.05194510362819]",38.841749,-77.051945
3,-1,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,kaggle_train_1467811184,"[43.64676078580431, -79.61913359444313]",43.646761,-79.619134
4,-1,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",kaggle_train_1467811193,"[34.245346218483846, -118.88440159932814]",34.245346,-118.884402


In [13]:
db_collection.insert_many(data_sentiment140.to_dict('records'))

<pymongo.results.InsertManyResult at 0x11001fb08>

In [14]:
print(db_collection.count())

199989


  """Entry point for launching an IPython kernel.


In [15]:
for tweet in db_collection.find()[:10]:
    print(tweet)

{'_id': ObjectId('5bf69df1523ec4046518337b'), 'sentiment': 0, 'id_src': 1685029439, 'date': 'Sat May 02 22:49:09 PDT 2009', 'flag': 'NO_QUERY', 'user': 'arizzz', 'text': "i work too much on the weekends. I'd like to blame my single status on that! ", 'id': 'kaggle_train_1685029439', 'loc': [38.87074543189877, -77.05818814834566], 'lat': 38.87074543189877, 'lon': -77.05818814834566}
{'_id': ObjectId('5bf69dbe523ec4046515f763'), 'id_src': 8837, 'sentiment': 1, 'text': " brokeback mountain was terrible.\n8838,0,Sentiment140,# @Catherine42 I wouldn't mind but I only had 1/2 a portion &amp; then left 1/2 the cream  just fruit for me then until my hols  x\n8839,1,Sentiment140,# @DeliciousLunch ... dark chocolate cookies? oh you tease! I'm writing to day n dipping into twitter for company \n8840,1,Sentiment140,# followfriday @mstuyvenberg @feb_unsw @hazelmail @beckescreet - all almost as cool as he-man and she-ra \n8841,1,Sentiment140,# followfriday @presentsqueen because she talks sense \n88