<a href="https://colab.research.google.com/github/Ajmyquira/tweets-topic-modelling/blob/master/1-tweets-completition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tweets dataset completion

In [None]:
# Colab-only helper that exposes Google Drive inside the runtime's filesystem
from google.colab import drive
# Mount the Drive at /content/drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Reading all the tweets collected

In [None]:
import pandas as pd
import glob
import os

# Glob pattern matching every partial export (tweets0*.csv) in the data folder
files_joined = os.path.join('/content/drive/MyDrive/UCSP/Data-Science-Topics/Data', "tweets0*.csv")

# glob makes no promise about ordering (it depends on the filesystem), so the
# paths are sorted to keep the row order of the merged frame reproducible
list_files = sorted(glob.glob(files_joined))

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when the pattern matches nothing
if not list_files:
    raise ValueError("No CSV files match the pattern: {}".format(files_joined))

print("** Merging multiple csv files into a single pandas dataframe **")
# Read every file and stack the results into one frame with a fresh 0..n-1 index
tweets = pd.concat(map(pd.read_csv, list_files), ignore_index=True)
tweets

In [None]:
# Dimensions of the merged frame as (rows, columns)
tweets.shape

(2197456, 36)

A total of 2,197,456 tweets were collected.


In [None]:
# Keep only the two columns of interest, exposing 'tweet' under the name 'text'
tweets_filtered = tweets[['date', 'tweet']].rename(columns={'tweet': 'text'})
tweets_filtered

In [None]:
# Order the rows by date, then renumber them from 0
tweets_sorted = tweets_filtered.sort_values('date').reset_index(drop=True)
tweets_sorted

Unnamed: 0,date,text
0,2021-07-30,@PedroCastilloTe Hablas igual a Diosdado cabel...
1,2021-07-30,@Camilita_evans @PedroCastilloTe @AlanGarciaPe...
2,2021-07-30,"Si mañana fueran las elecciones, volvería a vo..."
3,2021-07-30,"@PedroCastilloTe solo queda sacar a Bellido, s..."
4,2021-07-30,@ebelinortiz @PedroCastilloTe @KeikoFujimori A...
...,...,...
2197451,2022-04-28,@oscardiazperu @PedroCastilloTe Un cojudo a la...
2197452,2022-04-28,@Frospigliosi No le va a pasar nada a @PedroCa...
2197453,2022-04-28,"#CarlosBasombrío en Canal N, critica al JNE po..."
2197454,2022-04-28,#28Abr 🌎 | Pedro Castillo presentó propuesta p...


In this section, the days for which no tweets were collected are filled in by duplicating the tweets of a nearby day.

In [None]:
# Days that already have tweets. Each entry is the source of the copy made for
# the entry at the same position in new_day (a day may appear more than once).
cur_day = ['2021-07-30',
           '2021-07-30',
           '2021-09-24',
           '2021-10-11',
           '2021-11-06',
           '2021-12-14',
           '2022-01-17',
           '2022-01-31',
           '2022-02-16',
           '2022-02-24',
           '2022-03-13',
           '2022-03-29',
           '2022-03-29',
           '2022-04-03',
           '2022-04-03',
           '2022-04-27',
           '2022-04-27']

# Days without tweets; each one receives a re-dated copy of the matching cur_day
new_day = ['2021-07-28',
           '2021-07-29',
           '2021-09-25',
           '2021-10-10',
           '2021-11-07',
           '2021-12-13',
           '2022-01-16',
           '2022-01-30',
           '2022-02-15',
           '2022-02-23',
           '2022-03-12',
           '2022-03-30',
           '2022-03-31',
           '2022-04-01',
           '2022-04-02',
           '2022-04-29',
           '2022-04-30']

# The two lists are consumed pairwise, so a length mismatch is a data-entry error
if len(cur_day) != len(new_day):
  raise ValueError("cur_day and new_day must have the same number of entries")

# Gather the re-dated copies first and concatenate once afterwards: growing and
# re-sorting the full frame on every iteration repeats work whose result is
# discarded by the next iteration.
duplicated_days = []
for source_day, target_day in zip(cur_day, new_day):
  # Select the tweets of the day that serves as the source
  tweets_by_day = tweets_sorted[tweets_sorted['date'] == source_day]
  print(tweets_by_day.shape)
  # Re-date the copy through the 'date' column only; a frame-wide replace()
  # would also rewrite any tweet whose text equals the date string
  n_tweets_by_day = tweets_by_day.assign(date=target_day)
  duplicated_days.append(n_tweets_by_day)

# Original tweets followed by every duplicated day, renumbered from 0
new_tweets = pd.concat([tweets_sorted, *duplicated_days], ignore_index=True)
# A single sort puts the duplicated days in their chronological position
n_tweets_sorted = new_tweets.sort_values(by='date', ignore_index=True)

n_tweets_sorted

(21513, 2)
(21513, 2)
(2484, 2)
(397, 2)
(4237, 2)
(2666, 2)
(2722, 2)
(2930, 2)
(4070, 2)
(1022, 2)
(875, 2)
(11820, 2)
(11820, 2)
(6857, 2)
(6857, 2)
(8022, 2)
(8022, 2)


Unnamed: 0,date,text
0,2021-07-28,@PedroCastilloTe Hablas igual a Diosdado cabel...
1,2021-07-28,@ebelinortiz @PedroCastilloTe @KeikoFujimori Y...
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...
3,2021-07-28,@Perulibreprensa NOS MENTISTE @PedroCastilloTe...
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha..."
...,...,...
2315278,2022-04-30,@RichardArcePeru @PedroCastilloTe Es un gobier...
2315279,2022-04-30,@JorgeMunozPe @JNE_Peru Ahora sería bueno que ...
2315280,2022-04-30,@NoDignos @PedroCastilloTe @congresoperu Jsjsjsjs
2315281,2022-04-30,"Now playing Pedro Castillo aosto 13,2021 by Pe..."


In [None]:
# Transforming the date values to datetime in a new frame, so that
# n_tweets_sorted keeps its original date column
n_tweets_date = n_tweets_sorted.assign(date=pd.to_datetime(n_tweets_sorted['date']))

In [None]:
# Add a column holding the calendar day of each timestamp, to group on later
n_tweets_date["my_day"] = n_tweets_date["date"].dt.date

In [None]:
# Report how many tweets fall on each day, one "day: count" line per day
tweets_per_day = n_tweets_date.groupby("my_day").size()
for day, n_rows in tweets_per_day.items():
  print("{}: {}".format(day, n_rows))

2021-07-28: 21513
2021-07-29: 21513
2021-07-30: 21513
2021-07-31: 13787
2021-08-01: 14659
2021-08-02: 11214
2021-08-03: 9933
2021-08-04: 10011
2021-08-05: 10072
2021-08-06: 9739
2021-08-07: 6383
2021-08-08: 6205
2021-08-09: 7155
2021-08-10: 6993
2021-08-11: 8345
2021-08-12: 9047
2021-08-13: 6166
2021-08-14: 5855
2021-08-15: 9431
2021-08-16: 5954
2021-08-17: 7491
2021-08-18: 11441
2021-08-19: 7048
2021-08-20: 11033
2021-08-21: 6610
2021-08-22: 5249
2021-08-23: 6349
2021-08-24: 6035
2021-08-25: 11281
2021-08-26: 4902
2021-08-27: 6162
2021-08-28: 4414
2021-08-29: 2296
2021-08-30: 2820
2021-08-31: 10676
2021-09-01: 6707
2021-09-02: 8932
2021-09-03: 7355
2021-09-04: 6170
2021-09-05: 4202
2021-09-06: 7130
2021-09-07: 5218
2021-09-08: 3409
2021-09-09: 4819
2021-09-10: 6028
2021-09-11: 10859
2021-09-12: 4171
2021-09-13: 5048
2021-09-14: 4170
2021-09-15: 4800
2021-09-16: 4439
2021-09-17: 7495
2021-09-18: 10109
2021-09-19: 9359
2021-09-20: 11032
2021-09-21: 10414
2021-09-22: 6594
2021-09-23: 378

In [None]:
# Display the date-sorted frame that includes the duplicated days
n_tweets_sorted

Unnamed: 0,date,text
0,2021-07-28,@PedroCastilloTe Hablas igual a Diosdado cabel...
1,2021-07-28,@ebelinortiz @PedroCastilloTe @KeikoFujimori Y...
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...
3,2021-07-28,@Perulibreprensa NOS MENTISTE @PedroCastilloTe...
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha..."
...,...,...
2315278,2022-04-30,@RichardArcePeru @PedroCastilloTe Es un gobier...
2315279,2022-04-30,@JorgeMunozPe @JNE_Peru Ahora sería bueno que ...
2315280,2022-04-30,@NoDignos @PedroCastilloTe @congresoperu Jsjsjsjs
2315281,2022-04-30,"Now playing Pedro Castillo aosto 13,2021 by Pe..."


In [None]:
# Write the completed dataset to Drive as JSON in "split" layout, without the row index
output_path = "/content/drive/MyDrive/UCSP/Data-Science-Topics/Data/all_tweets.json"
n_tweets_sorted.to_json(output_path, orient="split", index=False)