In [38]:
import psycopg2
from sklearn import svm
import pandas.io.sql as psql
import re
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import grid_search
import xgboost as xgb
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

###### Database connection details

In [42]:
conn = psycopg2.connect(database="<db>", user="<usrname>", password="<pwd>", host="<host>", port="<port>")

In [4]:
dataframe = psql.frame_query("SELECT idd, text, 2*alch_score+3 as alch_score_norm, local_score  FROM tweets.training1;", conn)



In [5]:
dataframe.head()

Unnamed: 0,idd,text,alch_score_norm,local_score
0,713290412480446464,Mercredi 13 j'vais à Lyon tout cosy,3.52785,4
1,713282956899385344,"Escalade au Mur de Lyon, pour enfants et adult...",3.589236,3
2,713311565504724992,RT @JpVacher: Le retour d'un événement tennis ...,1.554584,1
3,713280928391565312,@Anthony_Fcp @QBINZE a deux doigts je prend un...,3.678514,4
4,713299698161946624,@LeColonelActus Un peu d'eau pour votre moulin...,3.84093,4


In [6]:
def processTweet(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+',' ',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

In [7]:
dataframe['text']= dataframe['text'].apply(lambda x : processTweet(x))

In [20]:
dataframe['avg'] = dataframe.apply(lambda x : round((x['alch_score_norm'] + x['local_score'])/2) ,axis=1)

In [21]:
train_data, test_data = train_test_split(dataframe, test_size = 0.2)

In [36]:
print "Size of training data", len(train_data)
print "Size of test data",len(test_data)

Size of training data 377
Size of test data 95


In [23]:
vectorizer = TfidfVectorizer(min_df=5,
                             max_df = 1.0,
                             sublinear_tf=True,
                             use_idf=True)
train_vectors = vectorizer.fit_transform(train_data['text'])
test_vectors = vectorizer.transform(test_data['text'])

In [24]:
train_vectors

<377x174 sparse matrix of type '<type 'numpy.float64'>'
	with 3005 stored elements in Compressed Sparse Row format>

In [25]:
parameters = {'kernel':('linear', 'rbf'), 'C':[.1, 10]}
svr = svm.SVC()
clf = grid_search.GridSearchCV(svr, parameters)

In [26]:
clf.fit(train_vectors, train_data['avg'])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'kernel': ('linear', 'rbf'), 'C': [0.1, 10]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [39]:
joblib.dump(clf, "svm.pkl") 

['svm.pkl',
 'svm.pkl_01.npy',
 'svm.pkl_02.npy',
 'svm.pkl_03.npy',
 'svm.pkl_04.npy',
 'svm.pkl_05.npy',
 'svm.pkl_06.npy',
 'svm.pkl_07.npy',
 'svm.pkl_08.npy',
 'svm.pkl_09.npy',
 'svm.pkl_10.npy',
 'svm.pkl_11.npy',
 'svm.pkl_12.npy',
 'svm.pkl_13.npy',
 'svm.pkl_14.npy',
 'svm.pkl_15.npy',
 'svm.pkl_16.npy',
 'svm.pkl_17.npy',
 'svm.pkl_18.npy']

In [40]:
joblib.dump(vectorizer, "tfidf.pkl") 

['tfidf.pkl', 'tfidf.pkl_01.npy', 'tfidf.pkl_02.npy']

##### Training Accuracy 

In [30]:
cm_train = (train_data['local_score'],clf.predict(train_vectors))

In [31]:
f1_score(train_data['local_score'], clf.predict(train_vectors), average="macro")

0.95903208142819596

###### Test 

In [32]:
predict = clf.predict(test_vectors)

In [33]:
cm  = (confusion_matrix(test_data['avg'], predict))
print cm

[[ 1  1  0  0]
 [ 0 28  5  4]
 [ 0  6 15  5]
 [ 2  9  2 17]]


In [34]:
f1_score(test_data['local_score'], clf.predict(test_vectors), average="macro")

0.58087522045855378

In [35]:
print "Correctly Classified",cm.trace()
print "Total data point ",cm.sum()

Correctly Classified 61
Total data point  95
