# Clean Free Tweets

This notebook mirrors the `clean_free_tweets.py` script so you can explore, clean, and enrich the Free tweet export interactively.



In [6]:
from __future__ import annotations

import re
from pathlib import Path
from typing import List

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

INPUT_PATH = Path("data/free tweet export.csv")
OUTPUT_PATH = Path("data/free_tweet_export_clean.csv")

MENTION_RE = re.compile(r"(?i)@\w+")
URL_RE = re.compile(r"https?://\S+")
WHITESPACE_RE = re.compile(r"\s+")

ON_TOPIC_KEYWORDS = [
    "free",
    "freebox",
    "free mobile",
    "free pro",
    "free fibre",
    "free delta",
    "free pop",
    "free 5g",
    "free 4g",
    "rÃ©seau free",
    "reseau free",
    "freewifi",
    "free assistance",
    "freebox pop",
    "freebox delta",
]

THEME_KEYWORDS = {
    "reseau": [
        "panne",
        "coupure",
        "rÃ©seau",
        "reseau",
        "connexion",
        "internet",
        "debit",
        "upload",
        "download",
        "ping",
        "fibre",
        "4g",
        "5g",
        "latence",
    ],
    "facturation": [
        "facture",
        "prelevement",
        "prÃ©lÃ¨vement",
        "paiement",
        "remboursement",
        "surfacturation",
        "montant",
        "tarif",
        "prix",
    ],
    "abonnement": [
        "abonnement",
        "resiliation",
        "rÃ©siliation",
        "inscription",
        "offre",
        "contrat",
        "portabilite",
        "portabilitÃ©",
    ],
    "equipement": [
        "box",
        "modem",
        "routeur",
        "player",
        "dÃ©codeur",
        "decodeur",
        "tv",
        "serveur",
    ],
    "support": [
        "service client",
        "hotline",
        "assistance",
        "sav",
        "help",
        "support",
        "ticket",
    ],
}

URGENCY_PATTERNS = [
    r"\burgent[e]?\b",
    r"\bimpossible\b",
    r"\bdepuis\s+\d+\s*(?:jours?|heures?)",
    r"\bdepuis\s+(?:hier|ce matin)\b",
    r"\bhelp\b",
    r"\bsvp\b",
    r"\basap\b",
    r"\bperdu\b",
    r"\baucun service\b",
]



In [7]:
def load_dataset(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, keep_default_na=False, na_values=["null"])
    if "full_text" not in df.columns:
        raise ValueError("Expected 'full_text' column not found in dataset.")
    df["full_text"] = df["full_text"].astype(str)
    return df


def is_retweet(row: pd.Series) -> bool:
    retweeted_status = row.get("retweeted_status")
    if isinstance(retweeted_status, float) and pd.isna(retweeted_status):
        retweeted_status = None
    return (
        (isinstance(retweeted_status, str) and retweeted_status.strip() != "")
        or str(row.get("full_text", "")).strip().lower().startswith("rt @")
    )


def remove_retweets(df: pd.DataFrame) -> pd.DataFrame:
    mask = df.apply(lambda row: not is_retweet(row), axis=1)
    return df[mask].copy()


def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    return df.drop_duplicates(subset=["full_text"]).copy()


def is_on_topic(text: str) -> bool:
    lower_text = text.lower()
    return any(keyword in lower_text for keyword in ON_TOPIC_KEYWORDS)


def filter_off_topic(df: pd.DataFrame) -> pd.DataFrame:
    mask = df["full_text"].fillna("").apply(is_on_topic)
    return df[mask].copy()



In [8]:
def normalize_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text_no_urls = URL_RE.sub(" ", text)
    text_no_mentions = MENTION_RE.sub(" ", text_no_urls)
    ascii_text = text_no_mentions.encode("ascii", "ignore").decode("ascii", errors="ignore")
    normalized = WHITESPACE_RE.sub(" ", ascii_text)
    return normalized.strip()


def detect_theme(text: str) -> str:
    if not text:
        return "autre"
    lower_text = text.lower()
    detected: List[str] = []
    for theme, keywords in THEME_KEYWORDS.items():
        if any(keyword in lower_text for keyword in keywords):
            detected.append(theme)
    return ";".join(sorted(set(detected))) if detected else "autre"


def detect_urgency(text: str) -> bool:
    if not text:
        return False
    for pattern in URGENCY_PATTERNS:
        if re.search(pattern, text.lower()):
            return True
    return False


def annotate_sentiment(
    analyzer: SentimentIntensityAnalyzer, text: str
) -> tuple[str, float]:
    if not text:
        return "neutre", 0.0
    score = analyzer.polarity_scores(text)["compound"]
    if score >= 0.05:
        label = "positif"
    elif score <= -0.05:
        label = "negatif"
    else:
        label = "neutre"
    return label, float(score)



In [9]:
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    df = df.dropna(subset=["full_text"])
    df = remove_retweets(df)
    df = drop_duplicates(df)
    df = filter_off_topic(df)

    analyzer = SentimentIntensityAnalyzer()
    clean_texts = df["full_text"].apply(normalize_text)
    df["clean_text"] = clean_texts

    sentiments = clean_texts.apply(lambda text: annotate_sentiment(analyzer, text))
    df["sentiment_label"] = sentiments.apply(lambda tup: tup[0])
    df["sentiment_score"] = sentiments.apply(lambda tup: tup[1])

    df["theme"] = clean_texts.apply(detect_theme)
    df["urgent"] = clean_texts.apply(detect_urgency)

    df["has_media"] = df.get("media", "").apply(
        lambda media: bool(media) and str(media) != "[]"
    )
    df["text_length"] = clean_texts.str.len()

    return df.reset_index(drop=True)



In [10]:
df_raw = load_dataset(INPUT_PATH)
df_clean = clean_dataset(df_raw)

OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(OUTPUT_PATH, index=False)

print(f"Clean dataset saved to {OUTPUT_PATH} ({len(df_clean)} tweets)")
df_clean.head()



Clean dataset saved to data/free_tweet_export_clean.csv (5795 tweets)


Unnamed: 0,id,created_at,full_text,media,screen_name,name,profile_image_url,user_id,in_reply_to,retweeted_status,...,retweeted,bookmarked,url,clean_text,sentiment_label,sentiment_score,theme,urgent,has_media,text_length
0,1343458257915031553,2020-12-28 08:26:23 +01:00,"ðŸ’© Ã @free parce-que DÃ©bit TrÃ¨s instable, â€¦ \n\...",[],m_annuel,M Annuel,https://abs.twimg.com/sticky/default_profile_i...,1104790986801250304,,,...,False,False,https://twitter.com/m_annuel/status/1343458257...,"parce-que Dbit Trs instable, \n\nFree en Franc...",neutre,0.0,reseau,False,False,136
1,1418550491034882052,2021-07-23 14:36:07 +02:00,Â« Faites vos premiers pas avec nous ! DÃ©couvre...,"[{""type"":""video"",""url"":""https://t.co/YCMv79evb...",Freebox,Assistance Freebox,https://pbs.twimg.com/profile_images/671676021...,58920430,,,...,False,False,https://twitter.com/Freebox/status/14185504910...,Faites vos premiers pas avec nous ! Dcouvrez v...,neutre,0.0,equipement;support,False,True,99
2,1438534927734169617,2021-09-16 18:07:08 +02:00,FreePlugs et boÃ®tiers CPL personnels : gÃ©rer v...,"[{""type"":""photo"",""url"":""https://t.co/JF6sn3PMy...",Freebox,Assistance Freebox,https://pbs.twimg.com/profile_images/671676021...,58920430,,,...,False,False,https://twitter.com/Freebox/status/14385349277...,FreePlugs et botiers CPL personnels : grer vot...,neutre,0.0,autre,False,True,187
3,1474424640470614017,2021-12-24 18:00:02 +01:00,Les Ã©quipes de lâ€™assistance Free vous souhaite...,"[{""type"":""photo"",""url"":""https://t.co/dhPpFn9nf...",Freebox,Assistance Freebox,https://pbs.twimg.com/profile_images/671676021...,58920430,,,...,False,False,https://twitter.com/Freebox/status/14744246404...,Les quipes de lassistance Free vous souhaitent...,positif,0.5972,support,False,True,153
4,1476961348131049473,2021-12-31 18:00:00 +01:00,Les Ã©quipes de lâ€™assistance Free vous souhaite...,"[{""type"":""photo"",""url"":""https://t.co/Nhz5mq8ku...",Freebox,Assistance Freebox,https://pbs.twimg.com/profile_images/671676021...,58920430,,,...,False,False,https://twitter.com/Freebox/status/14769613481...,Les quipes de lassistance Free vous souhaitent...,positif,0.5972,support,False,True,154


In [13]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5795 entries, 0 to 5794
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5795 non-null   int64  
 1   created_at         5795 non-null   object 
 2   full_text          5795 non-null   object 
 3   media              5795 non-null   object 
 4   screen_name        5795 non-null   object 
 5   name               5795 non-null   object 
 6   profile_image_url  5795 non-null   object 
 7   user_id            5795 non-null   int64  
 8   in_reply_to        3440 non-null   float64
 9   retweeted_status   0 non-null      float64
 10  quoted_status      78 non-null     float64
 11  media_tags         5795 non-null   object 
 12  favorite_count     5795 non-null   int64  
 13  retweet_count      5795 non-null   int64  
 14  bookmark_count     5795 non-null   int64  
 15  quote_count        5795 non-null   int64  
 16  reply_count        5795 