In [9]:
import psycopg2
from sklearn import svm
import pandas.io.sql as psql
import psycopg2
import re
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import grid_search
import xgboost as xgb
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

###### Database connection details

In [48]:
import json
with open('/Users/krishna/MOOC/smart-city/login.json') as data_file:
    db = json.load(data_file)

conn = psycopg2.connect(database=db['dbname'], user=db['user'], password=db['password'], host=db['host'], port=db['port'])

In [49]:
dataframe = psql.frame_query("select idd, text, 2*alch_score +3 as alch_score_norm, local_score from twitter.tweets where (2*alch_score+3) - local_score < 1 and (2*alch_score+3 - local_score) > - 1 and alch_score != '0'  ;", conn)

In [50]:
dataframe.head()

Unnamed: 0,idd,text,alch_score_norm,local_score
0,713126956317990916,RT @SireCedric: Quais du Polar (Lyon) mes hora...,1.554584,2
1,715372139143172096,RT @OLAngElles: Suite à sa victoire face à bar...,3.642358,3
2,715377326188535808,RT @RicoMdt185: Les keuf ont la haine comme Ly...,1.554582,2
3,715374761778487297,#Depuis que Lyon s’est éveillé à la Chine en 2...,1.554582,2
4,715389838883561472,Jamal Lyon ❤️,3.588044,3


In [51]:
dataframe.shape

(1437, 4)

In [52]:
def processTweet(tweet):
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+',' ',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

In [53]:
dataframe['text']= dataframe['text'].apply(lambda x : processTweet(x))

In [54]:
dataframe['avg'] = dataframe.apply(lambda x : round((x['alch_score_norm'] + x['local_score'])/2) ,axis=1)

In [55]:
train_data, test_data = train_test_split(dataframe, test_size = 0.2)

In [56]:
print "Size of training data", len(train_data)
print "Size of test data",len(test_data)

Size of training data 1149
Size of test data 288


In [57]:
vectorizer = TfidfVectorizer(min_df=5,
                             max_df = 1.0,
                             sublinear_tf=True,
                             use_idf=True)
train_vectors = vectorizer.fit_transform(train_data['text'])
test_vectors = vectorizer.transform(test_data['text'])

In [58]:
train_vectors

<1149x485 sparse matrix of type '<type 'numpy.float64'>'
	with 12156 stored elements in Compressed Sparse Row format>

In [59]:
parameters = {'kernel':('linear', 'rbf'), 'C':[.1, 10]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)

In [60]:
clf.fit(train_vectors, train_data['avg'])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [0.1, 10]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [71]:
joblib.dump(clf, "ModelObjects/modelsvm2804.pkl") 

['ModelObjects/modelsvm2804.pkl',
 'ModelObjects/modelsvm2804.pkl_01.npy',
 'ModelObjects/modelsvm2804.pkl_02.npy',
 'ModelObjects/modelsvm2804.pkl_03.npy',
 'ModelObjects/modelsvm2804.pkl_04.npy',
 'ModelObjects/modelsvm2804.pkl_05.npy',
 'ModelObjects/modelsvm2804.pkl_06.npy',
 'ModelObjects/modelsvm2804.pkl_07.npy',
 'ModelObjects/modelsvm2804.pkl_08.npy',
 'ModelObjects/modelsvm2804.pkl_09.npy',
 'ModelObjects/modelsvm2804.pkl_10.npy',
 'ModelObjects/modelsvm2804.pkl_11.npy',
 'ModelObjects/modelsvm2804.pkl_12.npy',
 'ModelObjects/modelsvm2804.pkl_13.npy',
 'ModelObjects/modelsvm2804.pkl_14.npy',
 'ModelObjects/modelsvm2804.pkl_15.npy',
 'ModelObjects/modelsvm2804.pkl_16.npy',
 'ModelObjects/modelsvm2804.pkl_17.npy',
 'ModelObjects/modelsvm2804.pkl_18.npy']

In [62]:
joblib.dump(vectorizer, "tfidf.pkl") 

['tfidf.pkl', 'tfidf.pkl_01.npy', 'tfidf.pkl_02.npy']

##### Training Accuracy 

In [63]:
cm_train = (train_data['local_score'],clf.predict(train_vectors))

In [64]:
f1_score(train_data['local_score'], clf.predict(train_vectors), average="macro")

0.97002799789414562

###### Test 

In [65]:
predict = clf.predict(test_vectors)

In [66]:
cm  = (confusion_matrix(test_data['avg'], predict))
print cm

[[ 2  4  0  0  0]
 [ 2 89 12  7  0]
 [ 0 14 39  9  0]
 [ 1 11 11 80  0]
 [ 0  2  0  3  2]]


In [67]:
f1_score(test_data['local_score'], clf.predict(test_vectors), average="macro")

0.60062106350887545

In [68]:
print "Correctly Classified",cm.trace()
print "Total data point ",cm.sum()

Correctly Classified 212
Total data point  288
