## Getting Sentiment140 Data

In [2]:
import pandas as pd

data = pd.read_csv(r"../new_data/training1600000.csv", engine='python')

In [3]:
data.shape

(1599999, 6)

In [4]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [5]:
data.tail()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1599994,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599998,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
data.columns

Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')

In [7]:
data = data.drop(data.columns[1:5], axis=1)

In [8]:
data.head()

Unnamed: 0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [10]:
X = data[data.columns[1]].values.tolist()
X[0]

"is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!"

In [11]:
Y = data[data.columns[0]].values.tolist()
Y[0]

0

## Shuffle Data

In [12]:
from sklearn.utils import shuffle

X, Y = shuffle(X, Y, random_state=0)
X[0]

'Happy birthday, sister! '

In [13]:
Y[0]

4

## Data Preprocessing

In [14]:
# remove numbers
import re

def remove_numbers(post):
    return re.sub(r'\d+', '', post)

In [15]:
X = [remove_numbers(p) for p in X]

In [16]:
# remove punctuations
import string

def remove_punctuation(post):
    translator = str.maketrans('', '', string.punctuation)
    return post.translate(translator)

In [17]:
X = [remove_punctuation(p) for p in X]

In [18]:
# remove whitespaces
def remove_whitespace(post):
    return " ".join(post.split())

In [19]:
X = [remove_whitespace(p) for p in X]

In [20]:
# get stop word lists
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kanglantang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# create an analyzer that can remove stop words and get stemmed forms of words
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

stemmer = WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()

def tokenizer(post):
    return (stemmer.lemmatize(w, pos='v') for w in analyzer(post) if w not in stop_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kanglantang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# create vectorizer
vectorizer = CountVectorizer(stop_words='english', min_df=0.00001, lowercase=True, 
                             decode_error='ignore', analyzer=tokenizer, max_features=5000)

In [29]:
X_matrix = vectorizer.fit_transform(X).toarray()

In [30]:
X_matrix.shape

(1599999, 5000)

In [31]:
vectorizer.get_feature_names()

['aa',
 'aaah',
 'aah',
 'aaron',
 'ab',
 'abandon',
 'abby',
 'abc',
 'ability',
 'abit',
 'able',
 'abs',
 'absolute',
 'absolutely',
 'abt',
 'abuse',
 'ac',
 'accent',
 'accept',
 'access',
 'accident',
 'accidentally',
 'accomplish',
 'accord',
 'account',
 'acct',
 'ace',
 'ache',
 'ack',
 'acoustic',
 'across',
 'act',
 'action',
 'active',
 'activity',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'ada',
 'adam',
 'adams',
 'add',
 'addict',
 'addiction',
 'addictive',
 'address',
 'adjust',
 'admin',
 'admire',
 'admit',
 'adobe',
 'adopt',
 'adorable',
 'adore',
 'ads',
 'adult',
 'advance',
 'advantage',
 'adventure',
 'advert',
 'advertise',
 'advice',
 'affect',
 'afford',
 'afraid',
 'africa',
 'afternoon',
 'afterwards',
 'age',
 'agenda',
 'agent',
 'agh',
 'ago',
 'agree',
 'ah',
 'aha',
 'ahah',
 'ahaha',
 'ahahaha',
 'ahead',
 'ahh',
 'ahhh',
 'ahhhh',
 'ahhhhh',
 'ahhhhhh',
 'ai',
 'aid',
 'aim',
 'aint',
 'air',
 'airport',
 'aj',
 'aka',
 'ako',
 'aku',
 '

## Convert to Sparse Format

In [34]:
from scipy.sparse import csr_matrix

In [35]:
X_sparse = csr_matrix(X_matrix[:10000])

## Store Preprocessed Data

In [36]:
df_features = pd.DataFrame(vectorizer.get_feature_names())
df_features.to_csv('../new_data/features.csv', index=False)

df_Y = pd.DataFrame(Y[:10000])
df_Y.to_csv('../new_data/Y.csv',index=False)

# df_X = pd.DataFrame(X_sparse)
# df_X.to_csv('../new_data/X_sparse.csv',index=False)

In [68]:
import scipy.sparse

scipy.sparse.save_npz('../new_data/X_sparse.npz', X_sparse)

In [69]:
#load = scipy.sparse.load_npz('../new_data/X_sparse.npz')