# Create training set based on Faitelson's tweets and replies

In [2]:
import pandas as pd

In [3]:
from parse_tweets import *

In [4]:
# Tweets: IDs are read as strings; when a tweet_id repeats only its first row is kept.
tweets_raw = pd.read_csv("david_faitelson_tweets.csv", dtype={"tweet_id": "str"})
tweets = tweets_raw.drop_duplicates(subset="tweet_id", keep="first").reset_index(drop=True)

# Replies: repeated (tweet_id, text) pairs keep their first row, then rows whose
# reply_id equals their tweet_id are discarded.
replies_raw = pd.read_csv("replies.csv", dtype={"tweet_id": "str", "reply_id": "str"})
replies_dedup = replies_raw.drop_duplicates(subset=["tweet_id", "text"], keep="first")
is_self_reference = replies_dedup["tweet_id"] == replies_dedup["reply_id"]
replies = replies_dedup[~is_self_reference].reset_index(drop=True)

In [5]:
# Report how many rows each table holds after de-duplication.
n_tweets = len(tweets)
n_replies = len(replies)
print(f"Total tweets: {n_tweets:,.0f}")
print(f"Total replies: {n_replies:,.0f}")

Total tweets: 3,974
Total replies: 89,697


In [6]:
# Derive the same three columns from the raw `text` column of both tables,
# using the helpers star-imported from parse_tweets.
for frame in (tweets, replies):
    raw_text = frame["text"]
    frame["date"] = raw_text.map(get_date)
    frame["user"] = raw_text.map(get_user)
    frame["tweet"] = raw_text.map(format_text)

In [7]:
# Look up, for every reply, the date of the tweet it answers (NaN when the tweet_id is unknown).
date_by_tweet_id = tweets.set_index("tweet_id")["date"]
replies["tweet_date"] = replies["tweet_id"].map(date_by_tweet_id)

In [8]:
# Key words grouped by subject; a reply is kept later on if it contains any of them.
key_words_by_subject = {
    "punch": [
        "putazo", "vergazo", "madrazo", "chingadazo", "puño",
        "tortazo", "sopla", "moco", "gancho", "golpe",
    ],
    "cuauhtemoc": [
        "tepito", "joroba", "divo", "cuau", "cuauhtemoc", "blanco", "cuernavaca",
        "dromedario", "camello", "cuello", "white", "morelos", "tlatoani", "emperador",
    ],
    "time_and_location": [
        "veracruz", "vera", "tarde", "puerto", "pirata", "ventana",
        "vestidor", "jarocho", "jarocha", "2003", "heroico",
    ],
    "recipient": [
        "papada", "cachete", "mandibula", "quijada",
    ],
}

In [9]:
# Flatten every subject's list into one lower-cased list, preserving order.
key_words = []
for subject_words in key_words_by_subject.values():
    key_words.extend(word.lower() for word in subject_words)

len(key_words)

39

In [10]:
def _mentions_key_word(text):
    """Return True when any entry of `key_words` occurs as a substring of `text`."""
    for word in key_words:
        if word in text:
            return True
    return False


# Substring test on the lower-cased reply text, so a key word also matches inside a longer word.
# NOTE(review): a non-string `tweet` value would raise here — confirm format_text always returns str.
replies['keep'] = replies['tweet'].str.lower().apply(_mentions_key_word)

In [11]:
# Count and share of replies flagged by the key-word filter.
n_interesting = replies["keep"].sum()
share_interesting = n_interesting / len(replies)
print(f"Total replies of interest: {n_interesting:,.0f}")
print(f"Percentage of replies of interest: {share_interesting:.1%}")

Total replies of interest: 22,232
Percentage of replies of interest: 24.8%


In [12]:
# Distinct tweet IDs that received at least one kept reply, shown next to the total tweet count.
used_tweets = replies.loc[replies["keep"], "tweet_id"].unique()
len(used_tweets), tweets["tweet_id"].nunique()

(3582, 3974)

In [13]:
# Kept replies, without the tweet_date and keep columns.
kept_replies = replies[replies["keep"]].drop(columns=["tweet_date", "keep"])
# Tweets that received at least one kept reply.
replied_tweets = tweets[tweets["tweet_id"].isin(used_tweets)]

# One row per (tweet, kept reply); shared column names get the _x (tweet) / _y (reply) suffixes.
df = replied_tweets.merge(kept_replies, on="tweet_id", how="left")
# Disabled: would append "\nRespuesta graciosa:" to every tweet_x.
# df['tweet_x']=df['tweet_x']+"\nRespuesta graciosa:"
print(df.shape)
# Every tweet with a kept reply must be present after the merge.
assert df.tweet_id.nunique()==len(used_tweets)

(22232, 10)


Remove trailing hashtags from the replies, so the model does not learn to end its answers with hashtags

In [14]:
# Shuffle the rows reproducibly before cleaning.
df = df.sample(frac=1, random_state=42)

# Strip the whole run of trailing hashtag tokens (each with the whitespace before it)
# in a single pass. The previous fixed five-pass loop left a hashtag behind on replies
# ending in six or more hashtags. A reply that consists of a single hashtag has no
# leading whitespace and is intentionally left untouched here.
df['tweet_y'] = df.tweet_y.str.replace(r'(?:\s+#\S*)+$', '', regex=True)

In [15]:
# Rows whose reply still ends with a token that starts with "#".
last_token = df["tweet_y"].str.split().str[-1]
df[last_token.str[:1] == "#"]

Unnamed: 0,tweet_id,text_x,date_x,user_x,tweet_x,reply_id,text_y,date_y,user_y,tweet_y
3352,1011669571131052032,"David Faitelson\n@DavidFaitelson_\n·\nJun 26, ...",2018-06-26,@DavidFaitelson_,El uniforme nigeriano ya se “robó” la jornada ...,1011671561294827520,"Serna\n@Vctornegro10ho1\n·\nJun 26, 2018\n#Put...",2018-06-26,@Vctornegro10ho1,#Putazo
9812,1314441583865991169,"David Faitelson\n@DavidFaitelson_\n·\nOct 9, 2...",2020-10-09,@DavidFaitelson_,#LetsGoYankees\nQuote\nEl Taquero OG\n@Nacion_...,1314557673707245568,Miguel Gómez Osorio\n@MiguelGomez_Os\n·\nOct 9...,2020-10-09,@MiguelGomez_Os,#LetsGoCamello
20547,1448857287573921792,"David Faitelson\n@DavidFaitelson_\n·\nOct 14, ...",2021-10-14,@DavidFaitelson_,Replying to\n@Nacion_Taquero\n#BeatLA,1448857427806466074,"Sexpulvinismo-Swinger\n@AzulNoMas1\n·\nOct 14,...",2021-10-14,@AzulNoMas1,#Putazo


In [16]:
# Drop, in place, every row whose reply still ends with a token that starts with "#".
last_token = df["tweet_y"].str.split().str[-1]
ends_with_hashtag = last_token.str[:1] == "#"
df.drop(index=df[ends_with_hashtag].index, inplace=True)

Strip the trailing "GIF" marker from replies that include a GIF (the replies themselves are kept)

In [17]:
# For replies mentioning "GIF" (case-insensitive), tally what their final token is.
mentions_gif = df["tweet_y"].str.upper().str.contains("GIF")
gif_replies = df.loc[mentions_gif, "tweet_y"]
gif_replies.str.split().str[-1].value_counts()

tweet_y
GIF    164
Name: count, dtype: int64

In [18]:
# Where a reply ends with "\nGIF", cut those last four characters off; other replies are unchanged.
ends_with_gif = df["tweet_y"].str.endswith("\nGIF")
without_suffix = df["tweet_y"].str[:-4]
df["tweet_y"] = df["tweet_y"].mask(ends_with_gif, without_suffix)

In [19]:
# Shape of the cleaned frame at this point in the notebook.
current_shape = df.shape
print(f"Final shape: {current_shape}")

Final shape: (22229, 10)


In [20]:
# Threshold compared against date_x when splitting: rows before it go to the
# training file, rows on or after it go to the validation file.
cutoff_date = "2022-01-01"

In [25]:
# Drop, in place, every pair where has_url flags either the tweet or the reply.
tweet_has_url = df["tweet_x"].map(has_url)
reply_has_url = df["tweet_y"].map(has_url)
df.drop(index=df[tweet_has_url | reply_has_url].index, inplace=True)

In [27]:
# Split on the tweet's date: before the cutoff -> training, on or after -> validation.
is_before_cutoff = df["date_x"] < cutoff_date
is_from_cutoff = df["date_x"] >= cutoff_date
df[is_before_cutoff].to_csv("training_set.csv", index=False)
df[is_from_cutoff].to_csv("validation_set.csv", index=False)