In [1]:
import pandas as pd
import numpy as np

In [2]:
twitter_archive_df = pd.read_csv('twitter-archive-enhanced.csv')

In [None]:
import tweepy
from tweepy import OAuthHandler
import json
from timeit import default_timer as timer

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

# NOTE TO STUDENT WITH MOBILE VERIFICATION ISSUES:
# df_1 is a DataFrame with the twitter_archive_enhanced.csv file. You may have to
# change line 17 to match the name of your DataFrame with twitter_archive_enhanced.csv
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
tweet_ids = twitter_archive_df.tweet_id.values
len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

1: 892420643555336193
Fail
2: 892177421306343426


In [3]:
image_predictions_df = pd.read_csv('image_predictions.tsv', sep='\t')

In [4]:
json_data = pd.read_json('tweet-json.txt', lines=True)

In [5]:
final_json_data = json_data[['id', 'retweet_count', 'favorite_count']]

In [42]:
image_predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [43]:
final_json_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   id              2354 non-null   int64
 1   retweet_count   2354 non-null   int64
 2   favorite_count  2354 non-null   int64
dtypes: int64(3)
memory usage: 55.3 KB


## Data Assessment - Quality Issues (8)

#### Twitter Archive Dataframe
- tweet_id column is integer datatype
- in_reply_to_status_id is float datatype 
- in_reply_to_user_id is float datatype
- timestamp is object datatype
- retweeted_status_id is float datatype
- retweeted_status_user_id is float datatype
- one of the denominators has a rating of "0"
- many of the dog names are "none" or they don't make sense


In [7]:
twitter_archive_df.sample(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
305,836260088725786625,,,2017-02-27 17:01:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Lucy. She spent all morning overseeing...,,,,https://twitter.com/dog_rates/status/836260088...,13,10,Lucy,,,,
243,846139713627017216,,,2017-03-26 23:20:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",SHE DID AN ICY ZOOM AND KNEW WHEN TO PUT ON TH...,,,,https://twitter.com/csncapitals/status/8460884...,13,10,,,,,


In [7]:
twitter_archive_df['rating_denominator'].value_counts()

10     2333
11        3
50        3
20        2
80        2
0         1
120       1
7         1
170       1
150       1
130       1
90        1
110       1
2         1
70        1
40        1
16        1
15        1
Name: rating_denominator, dtype: int64

In [10]:
twitter_archive_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [8]:
# twitter_archive_df.loc[twitter_archive_df['rating_denominator'] != 10]

In [11]:
twitter_archive_df['name'].value_counts()

None        745
a            55
Charlie      12
Lucy         11
Cooper       11
           ... 
Chadrick      1
Tayzie        1
Ike           1
Petrick       1
Napolean      1
Name: name, Length: 957, dtype: int64

In [12]:
twitter_archive_df[twitter_archive_df["name"].duplicated()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
7,890729181411237888,,,2017-07-28 00:22:40 +0000,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,13,10,,,,,
12,889665388333682689,,,2017-07-25 01:55:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a puppo that seems to be on the fence a...,,,,https://twitter.com/dog_rates/status/889665388...,13,10,,,,,puppo
23,887473957103951883,,,2017-07-19 00:47:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Canela. She attempted some fancy porch...,,,,https://twitter.com/dog_rates/status/887473957...,13,10,Canela,,,,
24,887343217045368832,,,2017-07-18 16:08:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",You may not have known you needed to see this ...,,,,https://twitter.com/dog_rates/status/887343217...,13,10,,,,,
25,887101392804085760,,,2017-07-18 00:07:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This... is a Jubilant Antarctic House Bear. We...,,,,https://twitter.com/dog_rates/status/887101392...,12,10,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,


In [11]:
twitter_archive_df.sample(10)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
884,760190180481531904,,,2016-08-01 19:07:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Sadie. She's addicted to balloons. It's t...,,,,https://twitter.com/dog_rates/status/760190180...,10,10,Sadie,,,,
1216,714982300363173890,,,2016-03-30 01:07:18 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Calbert. He forgot to clear his Google...,,,,https://twitter.com/dog_rates/status/714982300...,9,10,Calbert,,,,
1833,676146341966438401,,,2015-12-13 21:07:04 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Bert. He likes flowers. 10/10 https://...,,,,https://twitter.com/dog_rates/status/676146341...,10,10,Bert,,,,
1000,747963614829678593,,,2016-06-29 01:23:16 +0000,"<a href=""http://twitter.com/download/iphone"" r...",PUPPER NOOOOO BEHIND YOUUU 10/10 pls keep this...,,,,https://twitter.com/dog_rates/status/747963614...,10,10,,,,pupper,
701,786051337297522688,7.72743e+17,7.30505e+17,2016-10-12 03:50:17 +0000,"<a href=""http://twitter.com/download/iphone"" r...",13/10 for breakdancing puppo @shibbnbot,,,,,13,10,,,,,puppo
1905,674606911342424069,6.744689e+17,4196984000.0,2015-12-09 15:09:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",The 13/10 also takes into account this impecca...,,,,,13,10,,,,,
2148,669749430875258880,,,2015-11-26 05:28:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Clarence. Clarence thought he saw...,,,,https://twitter.com/dog_rates/status/669749430...,8,10,Clarence,,,,
2227,668268907921326080,,,2015-11-22 03:24:58 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have an Azerbaijani Buttermilk named G...,,,,https://twitter.com/dog_rates/status/668268907...,10,10,,,,,
664,790723298204217344,,,2016-10-25 01:14:59 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Happy. He's a bathtub r...,7.899865e+17,4196984000.0,2016-10-23 00:27:05 +0000,https://twitter.com/dog_rates/status/789986466...,12,10,Happy,,,,
223,849412302885593088,,,2017-04-05 00:04:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Noosh. He noticed you were in the show...,,,,https://twitter.com/dog_rates/status/849412302...,12,10,Noosh,,,,


## Data Assessment - Tidiness Issues (2)
- dog stage should be a single column, not 4
- all three tables can be joined into one


## Cleaning

#### Define


- change tweet_id column to object datatype
- change in_reply_to_status_id to object datatype 
- change in_reply_to_user_id to object datatype
- change timestamp to datetime datatype
- change retweeted_status_id to object datatype
- change retweeted_status_user_id to object datatype
- for the row where denominator has a rating of "0", change it "10"
- convert rows (where the dog name is 'None') to np.nan
- convert rows (where the dog name begins in lower case) to np.nan
- merge dog stage columns into one column
- merge all tables into one master table


#### Code

In [6]:
# 0. make a copy of the dataframe we want to clean, twitter_archive_df
twitter_archive_df_clean = twitter_archive_df.copy()

In [7]:
# 1. change id columns to object datatype
twitter_archive_df_clean = twitter_archive_df_clean.astype({"tweet_id": object,
                "in_reply_to_status_id": object,
                "in_reply_to_user_id": object,
                "retweeted_status_id": object,
                "retweeted_status_user_id": object})

In [8]:
# 2. change timestamp column to datetime datatype
twitter_archive_df_clean["timestamp"]= pd.to_datetime(twitter_archive_df_clean["timestamp"])

In [10]:
# 3. for the row where denominator has a rating of "0", change it "10"
twitter_archive_df_clean.at[313,'rating_denominator']=10

In [11]:
# 4. convert rows (where the dog name begins in lower case) to np.nan
# first make a mask
mask = twitter_archive_df_clean.name.str.contains('^[a-z]', regex = True)

# then change lower case names to nan
twitter_archive_df_clean.loc[mask,'name'] = np.nan

In [12]:
# 5. convert rows (where the dog name is 'None') to np.nan
twitter_archive_df_clean.name.replace('None', np.nan, inplace=True)

In [None]:
# 6. merge dog stage columns into one column

In [None]:
# first, we change cells with 'None' to ''
# then, make the new column equal to old column1 + old column2.. etc
# finally drop the old columns, and deal with multivalue cells

In [13]:
twitter_archive_df_clean.doggo.replace('None', '', inplace=True)
twitter_archive_df_clean.floofer.replace('None', '', inplace=True)
twitter_archive_df_clean.pupper.replace('None', '', inplace=True)
twitter_archive_df_clean.puppo.replace('None', '', inplace=True)

In [14]:
twitter_archive_df_clean['stage'] = twitter_archive_df_clean.doggo + \
    twitter_archive_df_clean.floofer + \
    twitter_archive_df_clean.pupper + \
    twitter_archive_df_clean.puppo

In [20]:
# check on the new column
twitter_archive_df_clean.stage.value_counts()

                1976
pupper           245
doggo             83
puppo             29
doggopupper       12
floofer            9
doggopuppo         1
doggofloofer       1
Name: stage, dtype: int64

In [40]:
twitter_archive_df_clean.stage.value_counts()

                1976
pupper           245
doggo             83
puppo             29
doggopupper       12
floofer            9
doggopuppo         1
doggofloofer       1
Name: stage, dtype: int64

In [14]:

# deal with the rows with multiple stage values
twitter_archive_df_clean.loc[twitter_archive_df_clean.stage == 'doggopupper'] = 'doggo,pupper'
twitter_archive_df_clean.loc[twitter_archive_df_clean.stage == 'doggofloofer'] = 'doggo,fluffer'
twitter_archive_df_clean.loc[twitter_archive_df_clean.stage == 'doggopuppo'] = 'doggo,puppo'


In [15]:
twitter_archive_df_clean.stage.value_counts()

                 1976
pupper            245
doggo              83
puppo              29
doggo,pupper       12
floofer             9
doggo,fluffer       1
doggo,puppo         1
Name: stage, dtype: int64

In [62]:
twitter_archive_df_clean.stage.value_counts()

pupper           245
doggo             83
puppo             29
doggo,pupper      12
floofer            9
doggo,fluffer      1
doggo,puppo        1
Name: stage, dtype: int64

In [15]:
# convert blank name cells to np.nan...but it seems like this turns everything into nan

twitter_archive_df_clean.stage.replace('', np.nan, inplace=True)

#old code...twitter_archive_df_clean.loc[twitter_archive_df_clean.stage == ''] = np.nan

In [16]:
# drop old columns
twitter_archive_df_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1, inplace=True)

In [None]:
# 7. merge all tables into one master table

In [17]:
# first change tweet id columns to object datatype (in the other tables)
final_json_data = final_json_data.astype({'id': 'object'})
image_predictions_df = image_predictions_df.astype({'tweet_id': 'object'})


In [18]:
# for consistency, change 'id' in json_data_df to 'tweet_id'
final_json_data.rename(columns = {'id':'tweet_id'}, inplace = True)

In [19]:
# merge final_json_data and image_predictions_df
result_1 = pd.merge(image_predictions_df,final_json_data,on='tweet_id')
result_1.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweet_count,favorite_count
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True,532,2535
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True,48,132
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True,47,128
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True,147,311
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True,41,111


In [20]:
# merge twitter_archive_df_clean and 'result_1' into master_df
master_df = pd.merge(twitter_archive_df_clean,result_1,on='tweet_id')


In [21]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2073 entries, 0 to 2072
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2073 non-null   object             
 1   in_reply_to_status_id       23 non-null     object             
 2   in_reply_to_user_id         23 non-null     object             
 3   timestamp                   2073 non-null   datetime64[ns, UTC]
 4   source                      2073 non-null   object             
 5   text                        2073 non-null   object             
 6   retweeted_status_id         79 non-null     object             
 7   retweeted_status_user_id    79 non-null     object             
 8   retweeted_status_timestamp  79 non-null     object             
 9   expanded_urls               2073 non-null   object             
 10  rating_numerator            2073 non-null   int64           

In [22]:
twitter_archive_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   tweet_id                    2356 non-null   object
 1   in_reply_to_status_id       91 non-null     object
 2   in_reply_to_user_id         91 non-null     object
 3   timestamp                   2356 non-null   object
 4   source                      2356 non-null   object
 5   text                        2356 non-null   object
 6   retweeted_status_id         193 non-null    object
 7   retweeted_status_user_id    193 non-null    object
 8   retweeted_status_timestamp  193 non-null    object
 9   expanded_urls               2297 non-null   object
 10  rating_numerator            2356 non-null   object
 11  rating_denominator          2356 non-null   object
 12  name                        1512 non-null   object
 13  stage                       380 non-null    obje

In [37]:
master_df.sample(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweet_count,favorite_count
865,740365076218183684,,,2016-06-08 02:09:24+00:00,"<a href=""http://twitter.com/download/iphone"" r...",When the photographer forgets to tell you wher...,,,,https://twitter.com/dog_rates/status/740365076...,...,0.246313,False,Windsor_tie,0.172446,False,mushroom,0.137516,False,495,2727
737,756998049151549440,,,2016-07-23 23:42:53+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Oliver. He's an English Creamschnitzel...,,,,https://twitter.com/dog_rates/status/756998049...,...,0.678555,True,Labrador_retriever,0.072632,True,Border_terrier,0.049033,True,2271,6923
149,857263160327368704,,,2017-04-26 16:00:39+00:00,"<a href=""http://twitter.com/download/iphone"" r...",This is Oscar and Oliver. Oliver shrunk Oscar....,,,,https://twitter.com/dog_rates/status/857263160...,...,0.998021,True,Pomeranian,0.000922,True,keeshond,0.000311,True,4934,21041


In [38]:
master_df.tweet_id.duplicated().sum()

0

#### Test

In [None]:
# 0. 
twitter_archive_df_clean.head(3)

In [17]:
# 1. change id columns to object datatype
print(twitter_archive_df_clean.dtypes)

tweet_id                      object
in_reply_to_status_id         object
in_reply_to_user_id           object
timestamp                     object
source                        object
text                          object
retweeted_status_id           object
retweeted_status_user_id      object
retweeted_status_timestamp    object
expanded_urls                 object
rating_numerator               int64
rating_denominator             int64
name                          object
doggo                         object
floofer                       object
pupper                        object
puppo                         object
dtype: object


In [14]:
2.
twitter_archive_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   tweet_id                    2356 non-null   object             
 1   in_reply_to_status_id       78 non-null     object             
 2   in_reply_to_user_id         78 non-null     object             
 3   timestamp                   2356 non-null   datetime64[ns, UTC]
 4   source                      2356 non-null   object             
 5   text                        2356 non-null   object             
 6   retweeted_status_id         181 non-null    object             
 7   retweeted_status_user_id    181 non-null    object             
 8   retweeted_status_timestamp  181 non-null    object             
 9   expanded_urls               2297 non-null   object             
 10  rating_numerator            2356 non-null   int64           

In [8]:
# 3. for the row where denominator has a rating of "0", change it "10"
twitter_archive_df_clean.loc[[313]]

NameError: name 'twitter_archive_df_clean' is not defined

In [13]:
# 4. convert rows (where the dog name is 'None') to np.nan
twitter_archive_df_clean.loc[twitter_archive_df_clean['name']=="None"]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


In [30]:
# 6
twitter_archive_df_clean[['name','stage']].sample(5)

Unnamed: 0,name,stage
1369,Rudy,
1145,Karll,
1262,Tater,
1995,Scott,pupper
1268,Cecil,


In [54]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2060 entries, 0 to 2059
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2060 non-null   object 
 1   in_reply_to_status_id       22 non-null     object 
 2   in_reply_to_user_id         22 non-null     object 
 3   timestamp                   2060 non-null   object 
 4   source                      2060 non-null   object 
 5   text                        2060 non-null   object 
 6   retweeted_status_id         77 non-null     object 
 7   retweeted_status_user_id    77 non-null     object 
 8   retweeted_status_timestamp  77 non-null     object 
 9   expanded_urls               2060 non-null   object 
 10  rating_numerator            2060 non-null   object 
 11  rating_denominator          2060 non-null   object 
 12  name                        1392 non-null   object 
 13  stage                       307 n

## Export the master dataframe

In [22]:
# saving the DataFrame as a CSV file
test_file = master_df.to_csv('twitter_archive_master.csv', index = False)

In [23]:
# saving the DataFrame to SQLite db
import sqlite3

conn = sqlite3.connect('twitter_archive_master.db') 
master_df.to_sql('master_table', conn, if_exists='replace')

## Analysis/Insights