In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
path = '../data/'
files = ['toots_BIG.csv', 'toots_BIG_2.csv', 'toots_BIG_3.csv']

# Load, Merge & Preprocess

In [3]:
# Step 1: Load and Concatenate the CSV files

dataframes = []

for file in files:
    df = pd.read_csv(path+file, usecols=['id', 'language', 'created_at', 'content', 'card.title', 'card.description'])
    dataframes.append(df)

# Concatenate all dataframes into one
big_df = pd.concat(dataframes, ignore_index=True)

# Display the first few rows of the concatenated dataframe
big_df.head()


Unnamed: 0,id,created_at,language,content,card.title,card.description
0,111716689622320123,2024-01-07 21:03:21.708000+00:00,en,<p>Chikmagalur Tourist Places: Your Ultimate G...,,
1,111716689610326246,2024-01-07 21:03:16+00:00,en,"<p>Dancing Adélie Penguins, McMurdo Sound, An...",,
2,111716689604989316,2024-01-07 21:02:45.688000+00:00,,2 Macdonald trip leaving Burrard Station @ Bay...,,
3,111716689599173639,2024-01-07 21:03:19+00:00,en,"<p>Here you go seekers, some more good music (...",,
4,111716689598556513,2024-01-07 21:01:16+00:00,en,"<p>Reiterating <a href=""https://www.sportingne...",,


In [4]:
# Step 2: Merge Text Columns
# Merge 'content' with 'card.title' and 'card.description' if they exist
big_df['content'] = big_df['content'].fillna('') + big_df['card.title'].fillna('') + big_df['card.description'].fillna('')

# Keep only the relevant columns
big_df = big_df[['id', 'language', 'created_at', 'content']]

# Step 3: Remove Duplicates and Empty Contents
# Count the number of rows before removing duplicates
initial_count = len(big_df)

# Remove duplicates based on 'id'
big_df = big_df.drop_duplicates()

# Count the number of duplicates removed
duplicates_removed = initial_count - len(big_df)
print(f"Number of duplicates removed: {duplicates_removed}")

# Count the number of rows before removing empty content
count_before_empty_removal = len(big_df)

# Remove rows where 'content' is empty
big_df = big_df[big_df['content'].str.strip() != '']

# Count the number of empty content rows removed
empty_contents_removed = count_before_empty_removal - len(big_df)
print(f"Number of empty content rows removed: {empty_contents_removed}")

# Display the first few rows of the processed dataframe
print(big_df.head())


Number of duplicates removed: 330051
Number of empty content rows removed: 1644
                   id language                        created_at  \
0  111716689622320123       en  2024-01-07 21:03:21.708000+00:00   
1  111716689610326246       en         2024-01-07 21:03:16+00:00   
2  111716689604989316      NaN  2024-01-07 21:02:45.688000+00:00   
3  111716689599173639       en         2024-01-07 21:03:19+00:00   
4  111716689598556513       en         2024-01-07 21:01:16+00:00   

                                             content  
0  <p>Chikmagalur Tourist Places: Your Ultimate G...  
1  <p>Dancing Adélie Penguins, McMurdo Sound, An...  
2  2 Macdonald trip leaving Burrard Station @ Bay...  
3  <p>Here you go seekers, some more good music (...  
4  <p>Reiterating <a href="https://www.sportingne...  


In [5]:
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

# Step 3: Clean text
# Ensure tqdm is integrated with pandas
tqdm.pandas()

def clean_text(text):
    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)

    # Remove or replace emojis and emoticons
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    return text

# Assuming big_df is your main DataFrame
big_df['cleaned_content'] = big_df['content'].progress_apply(clean_text)

# Display the first few rows of the cleaned content
big_df[['content', 'cleaned_content']].head()


100%|██████████| 102459/102459 [00:16<00:00, 6198.32it/s]


Unnamed: 0,content,cleaned_content
0,<p>Chikmagalur Tourist Places: Your Ultimate G...,Chikmagalur Tourist Places: Your Ultimate Guid...
1,"<p>Dancing Adélie Penguins, McMurdo Sound, An...","Dancing Adélie Penguins, McMurdo Sound, Antar..."
2,2 Macdonald trip leaving Burrard Station @ Bay...,2 Macdonald trip leaving Burrard Station @ Bay...
3,"<p>Here you go seekers, some more good music (...","Here you go seekers, some more good music (the..."
4,"<p>Reiterating <a href=""https://www.sportingne...",Reiterating


# Detect Language

In [6]:
import torch
print(torch.__version__)
print('CUDA available:', torch.cuda.is_available())


2.1.1
CUDA available: True


In [7]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
from tqdm import tqdm

# Load model and tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = XLMRobertaForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")

# Check if CUDA (GPU support) is available and move the model to GPU if it is
if torch.cuda.is_available():
    model = model.to('cuda')

def detect_language(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    if torch.cuda.is_available():
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]

tqdm.pandas()
big_df['detected_language'] = big_df['content'].progress_apply(detect_language)

print(big_df[['content', 'detected_language']].head())


100%|██████████| 102459/102459 [10:51<00:00, 157.34it/s]

                                             content detected_language
0  <p>Chikmagalur Tourist Places: Your Ultimate G...                en
1  <p>Dancing Adélie Penguins, McMurdo Sound, An...                en
2  2 Macdonald trip leaving Burrard Station @ Bay...                en
3  <p>Here you go seekers, some more good music (...                en
4  <p>Reiterating <a href="https://www.sportingne...                hi





In [8]:
import plotly.express as px
import pandas as pd

# Assuming big_df is your DataFrame with 'language' and 'detected_language' columns

# Get the count of each language and sort them
language_counts = big_df['language'].value_counts().sort_values(ascending=False)
detected_language_counts = big_df['detected_language'].value_counts().sort_values(ascending=False)

# Create a list of languages sorted by their overall frequency in the dataset
sorted_languages = (language_counts + detected_language_counts).sort_values(ascending=False).index

# Create histograms
fig_language = px.histogram(big_df, x='language', category_orders={'language': sorted_languages}, color='language',
                            title='Original Language Distribution')
fig_detected_language = px.histogram(big_df, x='detected_language', category_orders={'detected_language': sorted_languages}, color='detected_language',
                                     title='Detected Language Distribution')

# Show the histograms
fig_language.show()
fig_detected_language.show()


In [12]:
# Isolate where language is 'en' and detected_language is something else
df_language_en_detected_not_en = big_df[(big_df['language'] == 'en') & (big_df['detected_language'] != 'en')]
df_language_en_detected_not_en['content'].head(10)

4     <p>Reiterating <a href="https://www.sportingne...
7                                              <p>🐶</p>
16    <p>Okay, das wars wohl mit den Playoffs. <a hr...
26    <p>GitUI<br>L: <a href="https://github.com/ext...
28    <p><a href="https://mastodon.social/tags/TOHO"...
39    <p>ICYMI: <a href="https://mastodon.social/tag...
44    <p>Finisce l'avventura di Roma in Coppa Italia...
45    <p>Lenio Streck: &quot;Não conseguimos punir o...
46    <p>Endividamento global</p><p>Confira! 👇<br />...
48    <p><a href="https://mastodon.social/tags/K%C3%...
Name: content, dtype: object

In [14]:
# Isolate where language is something else and detected_language is 'en'
df_language_not_en_detected_en = big_df[(big_df['language'] != 'en') & (big_df['detected_language'] == 'en')]
df_language_not_en_detected_en['content'].head(10)

2     2 Macdonald trip leaving Burrard Station @ Bay...
23                  Img made with AI app from my photo.
34    <p>Ride it, if you can. </p><p><a href="https:...
41    <p>Putin’s air force fire on their own soldier...
77    Southbound [R] trains are running with severe ...
82                  Img made with AI app from my photo.
87    Route 8 experiencing delays of up to 20 minute...
90    <p>Weather report for <a href="https://troet.c...
95    <p>EDIT: no! Egyptian Big Ben-like chimes, ID ...
98    Lowell Line Train 2310 (3:30 pm from Lowell) i...
Name: content, dtype: object

**Conclusion:** detected_language column is more reliable.

In [15]:
# Replace 'content' column with 'cleaned_content'
big_df['content'] = big_df['cleaned_content']

# Replace 'language' column with 'detected_language'
big_df['language'] = big_df['detected_language']

# Drop the 'cleaned_content' and 'detected_language' columns as they are no longer needed
big_df = big_df.drop(columns=['cleaned_content', 'detected_language'])

# Save the updated DataFrame to a CSV file
csv_file_path = '../data/toots_final.csv'  # You can change the file name as needed
big_df.to_csv(csv_file_path, index=False)