# Pickle Load and Dump

In [4]:
import pandas as pd
import sklearn.linear_model
import sqlalchemy
import nltk 


In [6]:
# SQLAlchemy URL layout: dialect+driver://username:password@host:port/database
engine = sqlalchemy.engine.create_engine('postgresql://localhost:5432/allenbyron')

# Columns removed from the frame after the language filter below.
unused_columns = [
    'contributors',
    'is_quote_status',
    'in_reply_to_status_id',
    'favorite_count',
    'source',
    'retweeted',
    'coordinates',
    'entities',
    'in_reply_to_screen_name',
    'id_str',
    'retweet_count',
    'in_reply_to_user_id',
    'favorited',
    'retweeted_status',
    'user_',
    'geo',
    'in_reply_to_user_id_str',
    'possibly_sensitive',
    'lang',
    'created_at',
    'filter_level',
    'in_reply_to_status_id_str',
    'place',
    'extended_entities',
    'truncated',
]

# Read the full 'qanda' table, keep only the rows where lang == 'en',
# then drop the columns listed above (drop returns a new frame).
df = pd.read_sql_table(table_name='qanda', con=engine)
only_en_df = df.loc[df['lang'] == 'en']
df_transformed = only_en_df.drop(labels=unused_columns, axis=1)

In [9]:
import sklearn.feature_extraction

# Bag-of-words features: learn the vocabulary from the tweet texts and count
# how often each term occurs in each tweet.
count_vec = sklearn.feature_extraction.text.CountVectorizer()
tweet_texts = df_transformed['tweet_text']
words_array = count_vec.fit_transform(tweet_texts)

# Vocabulary terms, one per column of words_array.
# NOTE(review): get_feature_names() was replaced by get_feature_names_out() in
# newer scikit-learn releases -- revisit when the environment is upgraded.
feature_names = count_vec.get_feature_names()

# Displayed as the cell result: (number of tweets, vocabulary size).
words_array.shape

(186, 941)

In [10]:
from nltk.corpus import sentiwordnet
# format: sentiwordnet.senti_synsets('word') yields SentiSynset objects that
# expose pos_score() and neg_score()

# One score per tweet, in the same order as df_transformed.tweet_text.
# For every token the FIRST SentiWordNet synset is used: its positive score is
# added and its negative score subtracted. Tokens without any synset add nothing.
sentiment = []

for tweet in df_transformed.tweet_text:

    senti_score = 0

    for word in nltk.word_tokenize(tweet):
        # Look the token up once. list(...) is required because senti_synsets
        # can return a lazy iterable (a `filter` object on Python 3) that does
        # not support indexing.
        synsets = list(sentiwordnet.senti_synsets(word))

        # Explicit "no synset" check instead of a bare `except:`; real failures
        # (e.g. a missing NLTK corpus) now surface instead of silently turning
        # every score into 0.
        if not synsets:
            continue

        first_synset = synsets[0]
        senti_score += first_synset.pos_score()
        senti_score -= first_synset.neg_score()

    sentiment.append(senti_score)


In [19]:
# sklearn.metrics is used below; import it explicitly instead of relying on it
# being loaded as a side effect of importing sklearn.linear_model.
import sklearn.metrics

# Densify the sparse count matrix once and reuse it for both fit and predict.
words_dense = words_array.toarray()

# Fixed random_state so that re-running the cell reproduces the same model.
ts = sklearn.linear_model.TheilSenRegressor(random_state=0)
ts.fit(words_dense, sentiment)

# NOTE: predictions are made on the same rows the model was fitted on, so the
# error reported below is a training error, not an estimate of how well the
# model generalises to unseen tweets.
ts_prediction = ts.predict(words_dense)

# Single-argument print calls produce the same text on Python 2 and Python 3.
print('Type: TheilSen')
print('Mean Absolute Error:  %s' % (sklearn.metrics.mean_absolute_error(sentiment, ts_prediction),))
print('Coef:  %s' % (ts.coef_,))
print('Intercept:  %s' % (ts.intercept_,))

Type: TheilSen
Mean Absolute Error:  1.07239486108e-15
Coef:  [ -8.69107203e-03  -2.46710635e-02  -9.52198824e-03   1.11634722e-02
   7.72711755e-03   2.86354568e-02   9.37861159e-03  -8.69107203e-03
   2.82424814e-02  -3.10355459e-02  -5.30898156e-02   9.05482218e-02
  -9.00436642e-03  -2.46710635e-02  -2.22673643e-02  -3.53600216e-02
   9.04778570e-02   1.21598907e-02  -4.01448769e-02   4.10481032e-02
  -5.21642989e-03   4.29728788e-02  -7.64698118e-03  -1.54636177e-02
   8.78362249e-02   8.70919130e-02  -2.28708198e-03   3.29457643e-03
   9.37861159e-03   5.76264517e-03  -7.37926889e-02   1.44640386e-02
  -3.63787806e-03  -1.34124537e-02   3.65066089e-02   5.76264517e-03
   6.00920797e-02   2.16899563e-02   2.16899563e-02   3.29457643e-03
  -2.49095877e-02   1.11634722e-02   2.63467739e-02  -2.39713186e-02
  -7.37400353e-02  -4.24558646e-02  -5.89176455e-02  -8.85971093e-02
   1.54542351e-02   1.28165624e-02   1.54542351e-02   4.89961309e-03
  -9.72157046e-02   1.50780439e-02   3.20

In [25]:
import pickle

# Persist the fitted TheilSen regressor to disk.
# - Binary mode ('wb') is required: pickle output is bytes, and text mode can
#   corrupt it (newline translation) or fail outright on Python 3.
# - The `with` block closes the file even if pickling raises.
# NOTE(review): the file contains pickle data, not Python source, despite its
# '.py' extension; the path is kept unchanged so existing readers still find it.
# Only unpickle this file from a trusted location -- pickle.load can execute
# arbitrary code.
with open('QandA_TheilSenRegressor.py', 'wb') as f:
    pickle.dump(ts, f)