## IMPORT MODULE

In [19]:
import pandas as pd
import requests
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer
from bs4 import BeautifulSoup
import unicodedata
import re
import inflect

# READ TWITTER ENHANCED INTO PANDAS

In [14]:
df_enhanced = pd.read_csv('twitter-archive-enhanced.csv', sep = ',')

In [15]:
df_enhanced.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


# DOWLOAD TWEET IMAGE TSV FILE

## FUNCTION TO DOWNLOAD ANY URL

In [16]:
def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename

In [17]:
download_file(
    'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

'image-predictions.tsv'

## GET ADDITIONAL TWITTER DATA

In [18]:

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = df_enhanced.tweet_id.values
len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

## READ TWEET JSON TXT FILE INTO PANDAS

In [19]:
total_objects = []
total_dict = []
with open("tweet-json.txt") as file:
    for item in file:
        item = item.strip()
        total_objects.append(item)

In [20]:
for item in total_objects:
    list_items = json.loads(item)
    created_at = list_items['created_at']
    retweet_count = list_items['retweet_count']
    id = list_items['id']
    favorite_count = list_items['favorite_count']

    my_dict = {"TWEETID": [id], "RETWEETCOUNT":[retweet_count], "FAVOURITECOUNT": [favorite_count], "DATECREATED": [created_at]}
    total_dict.append(my_dict)

In [21]:
df_json_txt = pd.DataFrame.from_dict(total_dict)

In [22]:
df_json_txt.head(3)

Unnamed: 0,TWEETID,RETWEETCOUNT,FAVOURITECOUNT,DATECREATED
0,[892420643555336193],[8853],[39467],[Tue Aug 01 16:23:56 +0000 2017]
1,[892177421306343426],[6514],[33819],[Tue Aug 01 00:17:27 +0000 2017]
2,[891815181378084864],[4328],[25461],[Mon Jul 31 00:18:03 +0000 2017]


## DATA QUALITY AND TIDNESS ISSUES

1. Ambigious data
2. Data duplication for some tweets.
3. Over summarized data(Short forms used instead of full spellings)
4. Missing data in some rows and columns. Most cells are empty and therefore cannot perform analysis on them.
5. Wrong spellings forexample I think *pupright*  was meant to mean upright
6. Incomplete words. Forexample h*cking, RT are some of the words that cannot easily be understood in the context that were used.
7. Inconsistend data formats. Words mixed with URLs. Capitalized and Non capitalized words in same text.
8. Useless columns
10. Not each observation forms a row as some rows contain multiple values.

In [23]:
df_new = df_enhanced.copy()

In [24]:
df_new.head(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [26]:
df_new.drop(df_new.columns[[1,2,6,7,8, 13,14,15,16]], axis=1, inplace=True)

In [27]:
df_new.head(5)

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name
0,892420643555336193,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas
1,892177421306343426,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly
2,891815181378084864,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie
3,891689557279858688,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla
4,891327558926688256,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin


### TRIM TEXT

In [1]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

### REMOVE SQUARE BRACKETS

In [2]:
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

In [3]:
def denoise_text(text):
    #text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

### REMOVE NON ASCII WORDS

In [5]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

### REMOVE PUNCTUATION

In [9]:
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

### REPLACE NUMBERS WITH WORDS

In [10]:
def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

### DECONTRACT TEXT

In [23]:
def decontracted(Comment):
    # specific
    Comment = re.sub(r"won\'t", "will not", Comment)
    Comment = re.sub(r"shan\'t", "shall not", Comment)
    Comment = re.sub(r"can\'t", "can not",  Comment)
    Comment = re.sub(r"wont", "will not",  Comment)
    Comment = re.sub(r"cant", "can not",  Comment)
    Comment = re.sub(r"shant", "shall not",  Comment)

    # general
    Comment = re.sub(r"n\'t", " not",  Comment)
    Comment = re.sub(r"nt", " not",  Comment)
    Comment = re.sub(r"\'re", " are",  Comment)
    Comment = re.sub(r"\'s", " is",  Comment)
    Comment = re.sub(r"\'d", " would",  Comment)
    Comment = re.sub(r"\'ll", " will",  Comment)
    Comment = re.sub(r"\'t", " not", Comment)
    Comment = re.sub(r"\'ve", " have",  Comment)
    Comment = re.sub(r"\'m", " am",  Comment)
    Comment = re.sub(r"ur", " your",  Comment)
    Comment = re.sub(r"y\'", " you",  Comment)
    Comment = re.sub(r"luv", " love",  Comment)
    Comment = re.sub(r"2", "too",  Comment)
    Comment = re.sub(r"bac", "back",  Comment)
    return Comment

### CLEAN TEXT

In [24]:
def cleantext(Comment):
    Comment = decontracted(Comment)
    Comment = denoise_text(Comment)
    # remove non letters
    Comment = re.sub("[^a-zA-Z]", " ", Comment)
    Comment = remove_non_ascii(Comment)
    Comment = remove_punctuation(Comment)
    Comment = replace_numbers(Comment)
    return Comment

### Test function works

In [34]:
comm = "i won't and shan't"

In [35]:
clean_text = cleantext(comm)

In [36]:
clean_text

['i',
 ' ',
 'w',
 'i',
 'l',
 'l',
 ' ',
 'n',
 'o',
 't',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 's',
 'h',
 'a',
 'l',
 'l',
 ' ',
 'n',
 'o',
 't']

In [37]:

def convert(s):
 
    # initialization of string to ""
    new = ""
 
    # traverse in the string
    for x in s:
        new += x
 
    # return string
    return new
     

### Test function works

In [40]:
convert(clean_text)

'i will not and shall not'