# Pickle Load and Dump

In [45]:
import nltk
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sqlalchemy


In [46]:
# SQLAlchemy URL format: dialect+driver://username:password@host:port/database
engine = sqlalchemy.engine.create_engine('postgresql://localhost:5432/allenbyron')

# Read the `qanda` table into a DataFrame.
df = pd.read_sql_table('qanda', engine)

# Keep only the rows whose `lang` value is 'en'.
only_en_df = df[df['lang'] == 'en']

# Columns removed before modelling; everything not listed here is kept.
unused_columns = [
    'contributors',
    'is_quote_status',
    'in_reply_to_status_id',
    'favorite_count',
    'source',
    'retweeted',
    'coordinates',
    'entities',
    'in_reply_to_screen_name',
    'id_str',
    'retweet_count',
    'in_reply_to_user_id',
    'favorited',
    'retweeted_status',
    'user_',
    'geo',
    'in_reply_to_user_id_str',
    'possibly_sensitive',
    'lang',
    'created_at',
    'filter_level',
    'in_reply_to_status_id_str',
    'place',
    'extended_entities',
    'truncated',
]
df_transformed = only_en_df.drop(labels=unused_columns, axis=1)

In [47]:
import sklearn.feature_extraction

# Turn the tweet texts into a sparse term-count matrix; the vectorizer learns
# its vocabulary from the same texts it transforms.
tweet_corpus = df_transformed['tweet_text']
count_vec = sklearn.feature_extraction.text.CountVectorizer()
words_array = count_vec.fit_transform(tweet_corpus)

# Vocabulary terms, as reported by the fitted vectorizer.
feature_names = count_vec.get_feature_names()

# Displayed as the cell output.
words_array.shape

(240, 1086)

In [48]:
from nltk.corpus import sentiwordnet
#format: sentiwordnet.senti_synsets('word')[0].pos_score()

# Score every tweet by summing, over its tokens, the SentiWordNet polarity
# (positive score minus negative score) of each token's FIRST synset.
# Tokens with no SentiWordNet entry contribute nothing.
sentiment = []

for tweet in df_transformed.tweet_text:

    senti_score = 0

    for word in nltk.word_tokenize(tweet):

        # Look the word up once and materialise the result: depending on the
        # NLTK / Python version senti_synsets() may return a lazy iterator,
        # which cannot be indexed directly.
        synsets = list(sentiwordnet.senti_synsets(word))

        # Unknown word: skip it explicitly instead of relying on a bare
        # `except`, which would also hide real failures (e.g. the corpus not
        # being downloaded) and silently produce all-zero scores.
        if not synsets:
            continue

        first_sense = synsets[0]
        senti_score += first_sense.pos_score()
        senti_score -= first_sense.neg_score()

    sentiment.append(senti_score)


In [49]:
ts = sklearn.linear_model.TheilSenRegressor()
ts.fit(words_array.toarray(),sentiment)

ts_prediction = ts.predict(words_array.toarray())

print 'Type: TheilSen'
print 'Mean Absolute Error: ', sklearn.metrics.mean_absolute_error(sentiment,ts_prediction)
print 'Coef: ', ts.coef_
print 'Intercept: ', ts.intercept_

Type: TheilSen
Mean Absolute Error:  1.24327631523e-15
Coef:  [-0.04734702 -0.00703268 -0.00780995 ..., -0.00561939 -0.0214248
 -0.06918134]
Intercept:  -0.0176719440314


In [50]:
import sklearn.cross_validation

In [51]:
(train_x,test_x,train_y,test_y) = sklearn.cross_validation.train_test_split(words_array.toarray(), sentiment, test_size=0.33)

print len(train_x)
print len(test_x)
print len(train_y)
print len(test_y)

160
80
160
80


In [52]:
ts2 = sklearn.linear_model.TheilSenRegressor()
ts2.fit(train_x,train_y)

ts2_prediction = ts2.predict(test_x)

print 'Type: TheilSen'
print 'Mean Absolute Error: ', sklearn.metrics.mean_absolute_error(sentiment,ts_prediction)
print 'Coef: ', ts.coef_
print 'Intercept: ', ts.intercept_

Type: TheilSen
Mean Absolute Error:  1.24327631523e-15
Coef:  [-0.04734702 -0.00703268 -0.00780995 ..., -0.00561939 -0.0214248
 -0.06918134]
Intercept:  -0.0176719440314


In [53]:
import pickle

# Persist the fitted model to disk. Pickle output is binary data, so the file
# is opened in 'wb' mode, and the `with` block guarantees it is closed even if
# serialisation fails.
# NOTE(review): this saves `ts` (fit on all rows), not `ts2` (fit on the
# training split) -- confirm which model is meant to be shipped.
with open('ts.pickle', 'wb') as f:
    pickle.dump(ts, f)

In [54]:
# Wrap the held-out feature rows in a DataFrame and write them to CSV
# without the index column.
test_x_df = pd.DataFrame(test_x)
test_x_df.to_csv('test_x_df.csv', index=False)