In [1]:
import json 
import pandas as pd 

## Load Data

In [4]:
# Load the tweets file, reading it as one JSON object per line.
# Successfully decoded objects are collected here, in file order.
data = []

# The tweet text contains non-ASCII characters (curly quotes, etc.), so the
# encoding is stated explicitly instead of relying on the platform default,
# which is not UTF-8 everywhere (e.g. on Windows).
with open('data_and_files/random_tweets.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Blank lines (such as a trailing newline) are not records; skip them
        # rather than reporting each one as a decode error.
        if not line.strip():
            continue
        try:
            # Decode each line as a JSON object and append to the list
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Best-effort load: report the malformed line and keep going.
            print(f"Error decoding JSON: {e}")

# Convert list of JSON objects to pandas DataFrame (one row per decoded object)
df = pd.DataFrame(data)

# Display the first few rows
print(df.head())

                       created_at                   id               id_str  \
0  Tue Jul 31 13:34:40 +0000 2018  1024287229525598210  1024287229525598210   
1  Tue Jul 31 13:34:40 +0000 2018  1024287229512953856  1024287229512953856   
2  Tue Jul 31 13:34:40 +0000 2018  1024287229504569344  1024287229504569344   
3  Tue Jul 31 13:34:40 +0000 2018  1024287229496029190  1024287229496029190   
4  Tue Jul 31 13:34:40 +0000 2018  1024287229492031490  1024287229492031490   

                                                text  truncated  \
0  RT @KWWLStormTrack7: We are more than a month ...      False   
1  @hail_ee23 Thanks love its just the feeling of...      False   
2  RT @TransMediaWatch: Pink News has more on the...      False   
3  RT @realDonaldTrump: One of the reasons we nee...      False   
4  RT @First5App: This hearing of His Word doesn’...      False   

                                            entities  \
0  {'hashtags': [], 'symbols': [], 'user_mentions...   
1  {'hasht

## Convert data 

In [6]:
# Parse the raw `created_at` strings (e.g. "Tue Jul 31 13:34:40 +0000 2018")
# into timezone-aware datetimes, then store them back in the same column.
created_at_parsed = pd.to_datetime(
    df['created_at'],
    format='%a %b %d %H:%M:%S %z %Y',
)
df['created_at'] = created_at_parsed
print(df.head())

                 created_at                   id               id_str  \
0 2018-07-31 13:34:40+00:00  1024287229525598210  1024287229525598210   
1 2018-07-31 13:34:40+00:00  1024287229512953856  1024287229512953856   
2 2018-07-31 13:34:40+00:00  1024287229504569344  1024287229504569344   
3 2018-07-31 13:34:40+00:00  1024287229496029190  1024287229496029190   
4 2018-07-31 13:34:40+00:00  1024287229492031490  1024287229492031490   

                                                text  truncated  \
0  RT @KWWLStormTrack7: We are more than a month ...      False   
1  @hail_ee23 Thanks love its just the feeling of...      False   
2  RT @TransMediaWatch: Pink News has more on the...      False   
3  RT @realDonaldTrump: One of the reasons we nee...      False   
4  RT @First5App: This hearing of His Word doesn’...      False   

                                            entities  \
0  {'hashtags': [], 'symbols': [], 'user_mentions...   
1  {'hashtags': [], 'symbols': [], 'user_menti

In [8]:
# Split the timestamp into one column per component, named after the
# corresponding `.dt` accessor attribute.
timestamp_parts = df['created_at'].dt
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df[part] = getattr(timestamp_parts, part)
print(df.head())

                 created_at                   id               id_str  \
0 2018-07-31 13:34:40+00:00  1024287229525598210  1024287229525598210   
1 2018-07-31 13:34:40+00:00  1024287229512953856  1024287229512953856   
2 2018-07-31 13:34:40+00:00  1024287229504569344  1024287229504569344   
3 2018-07-31 13:34:40+00:00  1024287229496029190  1024287229496029190   
4 2018-07-31 13:34:40+00:00  1024287229492031490  1024287229492031490   

                                                text  truncated  \
0  RT @KWWLStormTrack7: We are more than a month ...      False   
1  @hail_ee23 Thanks love its just the feeling of...      False   
2  RT @TransMediaWatch: Pink News has more on the...      False   
3  RT @realDonaldTrump: One of the reasons we nee...      False   
4  RT @First5App: This hearing of His Word doesn’...      False   

                                            entities  \
0  {'hashtags': [], 'symbols': [], 'user_mentions...   
1  {'hashtags': [], 'symbols': [], 'user_menti

## Explore 

What time range do the published tweets cover?

In [17]:
# Distinct hour values present in the dataset
pd.unique(df['hour'])

array([13], dtype=int32)

In [18]:
# Distinct minute values present in the dataset
pd.unique(df['minute'])

array([34], dtype=int32)

In [16]:
# Distinct second values present in the dataset, in order of first appearance
pd.unique(df['second'])

array([40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24,
       23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13], dtype=int32)

In [19]:
# Show the tweet text column (pandas abbreviates long Series when printing)
tweet_text = df['text']
print(tweet_text)

0        RT @KWWLStormTrack7: We are more than a month ...
1        @hail_ee23 Thanks love its just the feeling of...
2        RT @TransMediaWatch: Pink News has more on the...
3        RT @realDonaldTrump: One of the reasons we nee...
4        RT @First5App: This hearing of His Word doesn’...
                               ...                        
11094    RT @sirDukeDevin: Trump keeps talking about "1...
11095    RT @AMAZlNGNATURE: Proud boy finding the best ...
11096    @ChrisDanicic @Mediaite @benshapiro The argume...
11097    RT @jodiequotepic: "I wanted to express the co...
11098    @w_terrence We shouldn't be making fun of peop...
Name: text, Length: 11099, dtype: object
