In [3]:
import json
import csv
import pandas as pd
import numpy as np
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

# Training dataset

In [4]:
# Read the raw training CSV, assigning explicit names to its columns;
# only "sentiment" and "text" are used further down.
tweets = pd.read_csv(
    "./datasets/tweets_training_160k.csv",
    names=["sentiment", "A", "B", "C", "D", "text"],
)

### Here we are interested only in the first and last columns, which hold the sentiment and the tweet text, respectively

In [5]:
# Positions 1-4 (columns "A".."D") are not needed: drop them so only the
# sentiment label and the tweet text remain, then save the slimmed frame.
unused_columns = tweets.columns[1:5]
new = tweets.drop(unused_columns, axis="columns")
new.to_csv('./datasets/tw_sent_160k_train.csv', index=False)

In [6]:
# Re-read the slimmed file and check the class balance.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` (pandas >= 1.3) is its replacement: malformed rows are
# skipped and a warning is emitted for each, as before.
new_tweets = pd.read_csv(
    "./datasets/tw_sent_160k_train.csv",
    low_memory=False,
    on_bad_lines="warn",
)
new_tweets.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

### The "clean" function cleans the tweets by removing hashtags, web URLs, punctuation and words that have only one letter

In [7]:
import re
from nltk.stem import porter
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
import unicodedata

# Shared NLP helpers, created once and reused by the cleaning code below.
wordnet_lemmatizer = WordNetLemmatizer()
tok = TweetTokenizer()

# Extra tokens to filter out on top of NLTK's English stop-word list:
# punctuation marks, quote tokens, and a few one-letter / pronoun forms.
added = [
    '.', ',', '-', ';', ':', '--', '\"', '(', ')', '\'s', '?', 'n\'t', '<', '>',
    '``', '\'\'', 'I', 'i', 'a', 'A', '..', '...', 'i\'m', 'I\'m',
]
stop_words = stopwords.words('english') + added


def clean(tweet):
    """Normalise a tweet (or any short text) and return it as a single string.

    Steps, in order:
      1. Extract the text with BeautifulSoup's ``get_text()``.
      2. NFKD-normalise and drop every character that cannot be encoded as ASCII.
      3. Remove hashtags, @-mentions, http(s)/www web addresses and punctuation,
         then lower-case the result.
      4. Delete runs of three identical characters.
      5. Split on whitespace, drop one-character words and words found in the
         module-level ``stop_words``, and tokenize what is left with ``tok``.

    Parameters
    ----------
    tweet : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    text = BeautifulSoup(tweet, 'lxml').get_text()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

    # Hashtags, twitter names, web addresses, punctuation.
    # The pattern is assembled from adjacent raw strings on purpose: a trailing
    # backslash inside a raw string is NOT a line continuation, so splitting the
    # pattern that way embeds a literal newline plus indentation in front of the
    # `www` alternative and stops it from ever matching a plain "www." address.
    # Note that `$-/` inside the character class is an ASCII range, so the class
    # also removes % & ' * + in addition to the characters listed explicitly.
    noise = (
        r"#[\w\d]*"
        r"|@[.]?[\w\d]*[\'\w*]*"
        r"|https?:\/\/\S+\b"
        r"|www\.(\w+\.)+\S*"
        r"|[.,:;!?()$-/^]*"
    )
    text = re.sub(noise, "", text).lower()

    # Strip repeated chars.
    # NOTE(review): this deletes the whole run of three identical characters
    # (e.g. "sooo" -> "s") rather than collapsing it - confirm that is intended.
    text = re.sub(r"(.)\1\1{1,1}", "", text)

    # Keep words longer than one character that are not stop words, and let the
    # tweet tokenizer split each of them into its final tokens.
    tokens = [
        token
        for word in text.split()
        if len(word) > 1 and word not in stop_words
        for token in tok.tokenize(word)
    ]

    return " ".join(tokens).strip()

### Apply clean to all the tweets

In [None]:
# Run the cleaner over every tweet, then overwrite the text column with the
# cleaned strings and preview the result.
snew = new_tweets["text"].apply(clean)
new_tweets["text"] = snew
new_tweets.head()

### By default, tweets with positive sentiment have the sentiment value 4; change this to 1, so that 1 marks positive tweets and 0 marks negative ones

In [None]:
# Positive tweets are labelled 4 in the source data; relabel them as 1.
# `.copy()` makes `df` an independent frame, so the assignment below does not
# write into a slice of `new_tweets` (which triggers SettingWithCopyWarning).
df = new_tweets[new_tweets.sentiment == 4].copy()
df["sentiment"] = 1

# Negative tweets already carry the label 0.
df2 = new_tweets[new_tweets.sentiment == 0]

# Negatives first, then positives, with a fresh 0..n-1 index.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
cleaned = pd.concat([df2, df], ignore_index=True)
cleaned.head()

In [117]:
# Sanity-check the combined frame: entry count, column dtypes and non-null counts.
cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
sentiment    1600000 non-null int64
text         1600000 non-null object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [118]:
# Save the relabelled, cleaned training tweets; index=False keeps the row index out of the CSV.
cleaned.to_csv('./datasets/clean_sent_160k_train.csv', index=False)

# Apple News articles dataset

In [19]:
# Load the news articles and summarise the frame (columns, dtypes, non-null counts).
news = pd.read_csv('./datasets/news.csv')
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2770 entries, 0 to 2769
Data columns (total 2 columns):
Date    2770 non-null int64
News    2770 non-null object
dtypes: int64(1), object(1)
memory usage: 43.4+ KB


### Clean all the news articles text data

In [20]:
# Build one [date, text] row per news entry.
new_arr = []

for raw_news, raw_date in zip(news['News'], news['Date']):
    # Dates are stored as integers; slice the digits into YYYY-MM-DD.
    stamp = str(raw_date)
    iso_date = "-".join((stamp[:4], stamp[4:6], stamp[6:]))

    # Entries are "|"-separated; the piece after the last "|" is discarded,
    # every other piece is cleaned on its own and the pieces are re-joined.
    cleaned_parts = [clean(part) for part in raw_news.split("|")[:-1]]

    new_arr.append([iso_date, "|".join(cleaned_parts)])

In [21]:
# Wrap the [date, text] rows collected in `new_arr` in a two-column frame.
df_news = pd.DataFrame(new_arr, columns=['Date', 'News'])

In [22]:
# Save the cleaned news; index=False keeps the row index out of the CSV.
df_news.to_csv('./datasets/news_clean.csv', index=False)

In [23]:
# Read the cleaned news back from disk and preview the first rows.
final_news = pd.read_csv('./datasets/news_clean.csv')
final_news.head()

Unnamed: 0,Date,News
0,2008-02-01,motorola weighs shift cellphones motorola said...
1,2008-02-04,obama mac clinton pc mac may cooler computer a...
2,2008-02-05,499 buys apple two new products added apples l...
3,2008-02-06,cisco profit shares fall cautious outlook netw...
4,2008-02-07,despite rise profit cisco eases forecast netwo...


# Apple tweets dataset

In [15]:
# Load the Apple-related tweets and preview the first rows.
# NOTE(review): this rebinds `tweets`, which the training-data cells near the
# top of the notebook also use for the raw training frame; re-running those
# cells after this one would operate on the wrong data. The execution counts
# of this section (In [15]-[17]) are also lower than those of the news section
# before it, so the saved cell order does not match the order of execution.
tweets = pd.read_csv('./datasets/tweets.csv')
tweets.head()

Unnamed: 0,Date,Tweets
0,2008-02-01,"Big, juicy, scrumptious gala apple...|OMG... T..."
1,2008-02-04,Great iPhone SummerBoard theme: Louie Mantia (...
2,2008-02-05,Sending Problem Report for Safari to Apple! SI...
3,2008-02-06,"geez, the ""Proofreader"" in Apple Pages is terr..."
4,2008-02-07,Books for iPhone = nice web-based ebook reader...


### Clean the tweets about Apple

In [16]:
# Build one [date, text] row per entry of the Apple tweets frame.
new_arr = []

for day_tweets, day in zip(tweets['Tweets'], tweets['Date']):
    # Tweets are "|"-separated; clean each one on its own and re-join them
    # with the same separator.
    cleaned_parts = [clean(piece) for piece in day_tweets.split("|")]
    new_arr.append([str(day), "|".join(cleaned_parts)])

  ' that document to Beautiful Soup.' % decoded_markup


In [17]:
# Wrap the [date, text] rows collected in `new_arr` in a two-column frame and
# write it out without the row index.
df_tweets = pd.DataFrame(new_arr, columns=['Date', 'Tweets'])
df_tweets.to_csv('datasets/tweets_clean.csv', index=False)