# Load datasets to mLab

* https://www.kaggle.com/c/twitter-sentiment-analysis2/discussion
* https://www.kaggle.com/kazanova/sentiment140

In [2]:
import pymongo
import pandas as pd
import numpy as np
import random

In [3]:
def db_connection(collection_name): 
    # connect to mLab DB
    try:
        with open("../credentials/mlab_credentials.txt", 'r', encoding='utf-8') as f:
            [name,password,url,dbname]=f.read().splitlines()
            db_conn = pymongo.MongoClient("mongodb://{}:{}@{}/{}".format(name,password,url,dbname))
            print ("DB connected successfully!!!")
    except pymongo.errors.ConnectionFailure as e:
        print ("Could not connect to DB: %s" % e) 
        
    db = db_conn[dbname]
    collection = db[collection_name]
    
    return collection


def kaggle_id(id):
    return "kaggle_train_" + str(id)


def sentiment140_id(id):
    return "sentiment140_" + str(id)


def random_sentiment(sentiment):
    sentiments_list = [-1,0,1]
    return random.choice(sentiments_list)

In [4]:
db_collection = db_connection("twitter_happiness_test")

DB connected successfully!!!


In [5]:
data_kaggle = pd.read_csv(
    "source/kaggle/train.csv",
    encoding='latin-1',
    header=0,
    names=["id_src","sentiment","text"]
)


data_kaggle["id"] = data_kaggle["id_src"].apply(kaggle_id)
data_kaggle["sentiment"] = data_kaggle["sentiment"].apply(random_sentiment)

print(data_kaggle.shape)
data_kaggle.head()

(99989, 4)


Unnamed: 0,id_src,sentiment,text,id
0,1,1,is so sad for my APL frie...,kaggle_train_1
1,2,-1,I missed the New Moon trail...,kaggle_train_2
2,3,0,omg its already 7:30 :O,kaggle_train_3
3,4,-1,.. Omgaga. Im sooo im gunna CRy. I'...,kaggle_train_4
4,5,0,i think mi bf is cheating on me!!! ...,kaggle_train_5


In [6]:
db_collection.insert_many(data_kaggle.to_dict('records'))

<pymongo.results.InsertManyResult at 0x11278abc8>

In [7]:
print(db_collection.count())

99989


  """Entry point for launching an IPython kernel.


In [8]:
data_sentiment140 = pd.read_csv(
    "source/sentiment140/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    names=["sentiment","id_src","date","flag","user","text"]
)

data_sentiment140["id"] = data_sentiment140["id_src"].apply(kaggle_id)
data_sentiment140["sentiment"] = data_sentiment140["sentiment"].apply(random_sentiment)

data_sentiment140 = data_sentiment140.head(100000).copy()
print(data_sentiment140.shape)
data_sentiment140.head()

(100000, 7)


Unnamed: 0,sentiment,id_src,date,flag,user,text,id
0,1,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",kaggle_train_1467810369
1,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,kaggle_train_1467810672
2,-1,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kaggle_train_1467810917
3,-1,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,kaggle_train_1467811184
4,-1,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",kaggle_train_1467811193


In [9]:
db_collection.insert_many(data_sentiment140.to_dict('records'))

<pymongo.results.InsertManyResult at 0x123cabb88>

In [10]:
print(db_collection.count())

199989


  """Entry point for launching an IPython kernel.


In [11]:
for tweet in db_collection.find()[:10]:
    print(tweet)

{'_id': ObjectId('5be490e3523ec416b1238831'), 'id_src': 1, 'sentiment': 1, 'text': '                     is so sad for my APL friend.............', 'id': 'kaggle_train_1'}
{'_id': ObjectId('5be490e3523ec416b1238832'), 'id_src': 2, 'sentiment': -1, 'text': '                   I missed the New Moon trailer...', 'id': 'kaggle_train_2'}
{'_id': ObjectId('5be490e3523ec416b1238833'), 'id_src': 3, 'sentiment': 0, 'text': '              omg its already 7:30 :O', 'id': 'kaggle_train_3'}
{'_id': ObjectId('5be490e3523ec416b1238834'), 'id_src': 4, 'sentiment': -1, 'text': "          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)...", 'id': 'kaggle_train_4'}
{'_id': ObjectId('5be490e3523ec416b1238835'), 'id_src': 5, 'sentiment': 0, 'text': '         i think mi bf is cheating on me!!!       T_T', 'id': 'kaggle_train_5'}
{'_id': ObjectId('5be490e3523ec416b1238836'), 'id_src': 6, 'sentiment': -1, 'text': '         or i just worry