# Load datasets to mLab

* https://www.kaggle.com/c/twitter-sentiment-analysis2/discussion
* https://www.kaggle.com/kazanova/sentiment140

In [1]:
import pymongo
import pandas as pd
import numpy as np
import random

In [99]:
def db_connection(collection_name): 
    # connect to mLab DB
    try:
        with open("../credentials/mlab_credentials.txt", 'r', encoding='utf-8') as f:
            [name,password,url,dbname]=f.read().splitlines()
            db_conn = pymongo.MongoClient("mongodb://{}:{}@{}/{}".format(name,password,url,dbname))
            print ("DB connected successfully!!!")
    except pymongo.errors.ConnectionFailure as e:
        print ("Could not connect to DB: %s" % e) 
        
    db = db_conn[dbname]
    collection = db[collection_name]
    
    return collection


def kaggle_id(id):
    return "kaggle_train_" + str(id)


def sentiment140_id(id):
    return "sentiment140_" + str(id)


def random_sentiment(sentiment):
    sentiments_list = [-1,0,1]
    return random.choice(sentiments_list)


def random_location_lat(dummy):
    #location = random.choice(locations_list)
    location = locations_list[0]
    lat = location["lat_min"] + random.random()*(location["lat_max"] - location["lat_min"])
    return lat

def random_location_lon(dummy):
    #location = random.choice(locations_list)
    location = locations_list[0]
    lon = location["lon_min"] + random.random()*(location["lon_max"] - location["lon_min"])
    return lon

In [100]:
db_collection_locations = db_connection("twitter_happiness_locations")
locations_list = [location for location in db_collection_locations.find()]

DB connected successfully!!!


In [102]:
db_collection = db_connection("twitter_happiness_test")

DB connected successfully!!!


In [103]:
result = db_collection.delete_many({})
print(result.deleted_count, " documents deleted")

199989  documents deleted


In [104]:
data_kaggle = pd.read_csv(
    "source/kaggle/train.csv",
    encoding='latin-1',
    header=0,
    names=["id_src","sentiment","text"]
)


data_kaggle["id"] = data_kaggle["id_src"].apply(kaggle_id)
data_kaggle["sentiment"] = data_kaggle["sentiment"].apply(random_sentiment)
data_kaggle["lat"] = data_kaggle["sentiment"].apply(random_location_lat)
data_kaggle["lon"] = data_kaggle["sentiment"].apply(random_location_lon)

print(data_kaggle.shape)
data_kaggle.head()

(99989, 6)


Unnamed: 0,id_src,sentiment,text,id,lat,lon
0,1,-1,is so sad for my APL frie...,kaggle_train_1,41.368761,2.171951
1,2,-1,I missed the New Moon trail...,kaggle_train_2,41.429764,2.149436
2,3,1,omg its already 7:30 :O,kaggle_train_3,41.477446,2.187214
3,4,-1,.. Omgaga. Im sooo im gunna CRy. I'...,kaggle_train_4,41.353495,2.318245
4,5,-1,i think mi bf is cheating on me!!! ...,kaggle_train_5,41.393061,2.03882


In [105]:
db_collection.insert_many(data_kaggle.to_dict('records'))

<pymongo.results.InsertManyResult at 0x10cf01f48>

In [106]:
print(db_collection.count())

99989


  """Entry point for launching an IPython kernel.


In [107]:
data_sentiment140 = pd.read_csv(
    "source/sentiment140/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=["sentiment","id_src","date","flag","user","text"]
)

data_sentiment140["id"] = data_sentiment140["id_src"].apply(kaggle_id)
data_sentiment140["sentiment"] = data_sentiment140["sentiment"].apply(random_sentiment)
data_sentiment140["lat"] = data_sentiment140["sentiment"].apply(random_location_lat)
data_sentiment140["lon"] = data_sentiment140["sentiment"].apply(random_location_lon)

data_sentiment140 = data_sentiment140.head(100000).copy()
print(data_sentiment140.shape)
data_sentiment140.head()

(100000, 9)


Unnamed: 0,sentiment,id_src,date,flag,user,text,id,lat,lon
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",kaggle_train_1467810369,41.429487,2.068303
1,-1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,kaggle_train_1467810672,41.481117,2.28184
2,1,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kaggle_train_1467810917,41.386288,2.233596
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,kaggle_train_1467811184,41.464487,2.224635
4,1,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",kaggle_train_1467811193,41.393776,2.193467


In [108]:
db_collection.insert_many(data_sentiment140.to_dict('records'))

<pymongo.results.InsertManyResult at 0x129914908>

In [109]:
print(db_collection.count())

199989


  """Entry point for launching an IPython kernel.


In [110]:
for tweet in db_collection.find()[:10]:
    print(tweet)

{'_id': ObjectId('5be4ccd8523ec42211df429c'), 'sentiment': 1, 'id_src': 1678125392, 'date': 'Sat May 02 05:36:03 PDT 2009', 'flag': 'NO_QUERY', 'user': 'TuanaMey', 'text': 'Just had 5 hours sleep  Getting ready to meet with friends now...sun is shining yeay!', 'id': 'kaggle_train_1678125392', 'lat': 41.37048027368166, 'lon': 2.187112108088115}
{'_id': ObjectId('5be4ccbc523ec42211dd1db6'), 'id_src': 8837, 'sentiment': 0, 'text': " brokeback mountain was terrible.\n8838,0,Sentiment140,# @Catherine42 I wouldn't mind but I only had 1/2 a portion &amp; then left 1/2 the cream  just fruit for me then until my hols  x\n8839,1,Sentiment140,# @DeliciousLunch ... dark chocolate cookies? oh you tease! I'm writing to day n dipping into twitter for company \n8840,1,Sentiment140,# followfriday @mstuyvenberg @feb_unsw @hazelmail @beckescreet - all almost as cool as he-man and she-ra \n8841,1,Sentiment140,# followfriday @presentsqueen because she talks sense \n8842,1,Sentiment140,# New York is the mos