# Tweets Classification 

project carried out by Bouchra BEN GHAZALA

## 1) Get data

In [2]:
import pandas as pd

In [3]:
sample_submission = pd.read_csv('data/sample_submission.csv')

train_data = pd.read_csv('data/training.csv')
test_data = pd.read_csv('data/test.csv')

In [4]:
sample_submission.head()

Unnamed: 0,TweetId,Label
0,13439423987429,Sports
1,48523497520948,Politics
2,183749287598,Sports
3,12749274958729,Sports


In [5]:
train_data.head()

Unnamed: 0,TweetId,Label,TweetText
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...
1,304834304222064640,Politics,'@rraina1481 I fear so'
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...


In [6]:
test_data.head()

Unnamed: 0,TweetId,TweetText
0,306486520121012224,'28. The home side threaten again through Maso...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....
2,289531046037438464,'@Sochi2014 construction along the shores of t...
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...


<span style="color:blue">We notice that the TweetTexts are not cleaned, there are some characters included.</span>

## 2) Data Cleaning

In [19]:
def clean_tweets(text):
    from bs4 import BeautifulSoup
    import re
    import string
    import emoji

    
    punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

    punct_mapping = {"‘": "'", "´": "'", "°": "", "€": "euros", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "pounds", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':'' ,'à':'a','â':'a','é':'e','è':'e','ê':'e',
                 'ë':'e','ç':'c','î':'i','ï':'i','ô':'o','û':'u','ù':'u','ü':'u','ÿ':'y','æ':'ae','œ':'oe','À':'A',
                'Â':'A','Ç':'C','É':'E','È':'E','Ê':'E','Ë':'E','Î':'I','Ï':'I','Ô':'O','Û':'U','Ù':'U','Ü':'U','Ÿ':'Y',
                'Æ':'AE','Œ':'OE'}

    # Remove URLs
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(url_regex, '', text)
    
    # Replace special characters based on the mapping dictionary
    for p in punct_mapping:
        text = text.replace(p, punct_mapping[p])
    
    # Add spaces around punctuation characters
    for p in punct:
        text = text.replace(p, f' {p} ')
        
    # Remove all punctuation except the single quote character
    text = re.sub("[%s]" % re.escape(string.punctuation.replace("'", "")), '', text)
    
    # Remove text within square brackets
    text = re.sub('\[.*?\]', '', text)
    
    # Remove HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()  
    
    # Remove other HTML tags
    text = re.sub('<.*?>+', '', text)  
    
    # Remove newline characters
    text = re.sub('\n', '', text)  
    
    # Remove words containing digits
    text = re.sub('\w*\d\w*', '', text) 
    
    # Keep French characters
    text = re.sub(r"[^a-zA-ZàâçéèêëîïôûùüÿæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÆŒ?.!,¿']+", " ", text)  


    # Replace multiple spaces with a single space
    text = re.sub(r'[" "]+', " ", text)

    
    # Convert text to lowercase
    text = str(text).lower()  
    
    
    # text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)  # Uncomment to remove French characters
    return text


<span style="color:blue">The clean_tweets function prepares tweet text, it removes URLs, special characters, and most punctuation while retaining French characters. Additionally, it eliminates text within square brackets, HTML tags, newline characters, and words containing digits. Finally, it converts the text to lowercase and reduces multiple spaces to a single space, ensuring optimal formatting for analysis.</span>

In [21]:
train_data['cleaned_tweets'] = train_data['TweetText'].apply(clean_tweets)


In [22]:
train_data.head()

Unnamed: 0,TweetId,Label,TweetText,cleaned_tweets
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...,' seckerry the value of the statedept and usai...
1,304834304222064640,Politics,'@rraina1481 I fear so',' i fear so '
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...,' watch video highlights of the final between ...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...,' rt chelscanlan at nitro circus at albertpark...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...,' cricketfox always a good thing thanks for th...


In [2]:
# 'cleaned_tweets' containing the preprocessed text data.
test_data['cleaned_tweets'] = test_data['TweetText'].apply(clean_tweets)


In [26]:
test_data.head()

Unnamed: 0,TweetId,TweetText,cleaned_tweets
0,306486520121012224,'28. The home side threaten again through Maso...,' the home side threaten again through mason b...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....,' mrbrown aulia thx for asking see it derives ...
2,289531046037438464,'@Sochi2014 construction along the shores of t...,' construction along the shores of the black sea
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...,' seckerry remarks after meeting with foreign ...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...,' the iplauction has begun ricky ponting is th...


In [38]:
test_data['cleaned_tweets'][1]

"' mrbrown aulia thx for asking see it derives from a series of abbreviations for pound avoirdupois a unit of mass ' "

### Remove short words and stop words

In [39]:
def get_token(df):
    import nltk
    
    df_tokenized_sentences = []
    tokenizer = nltk.RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')
    
    for i in range(len(df)):
        df_tokenized_sentences.append(tokenizer.tokenize(df['cleaned_tweets'][i]))
        
    return df_tokenized_sentences


def get_short_words(df):
    short_words = []
    
    tokens = get_token(df)
    
    for i in tokens:
        for j in i:
            if len(j)<3:
                short_words.append(j)
                
    return tokens

<span style="color:blue">
*  get_token(df):

This function tokenizes the cleaned tweets in the input DataFrame (df) using the nltk.RegexpTokenizer with a regular expression pattern that captures words, contractions, and any non-space characters that are not words.
It iterates through each row of the DataFrame, tokenizes the 'cleaned_tweets', and appends the tokenized sentences to a list.
Finally, it returns the list of tokenized sentences.

*  get_short_words(df):

This function extracts short words (words with a length less than 3 characters) from the tokenized tweets.
It first calls the get_token function to tokenize the tweets.
Then, it iterates through each tokenized sentence and each word within it, checking if the length of the word is less than 3.
Short words are appended to a list.
Finally, it returns the list of tokens, which effectively contains all the tokenized sentences. However, the function name and its purpose seem mismatched. It should return the list of short words if that's the intended behavior.
</span>

In [30]:
short_words = []

train_short_words = get_short_words(train_data)
test_short_words  = get_short_words(test_data)

for i in train_short_words:
    for j in i:
        if len(j)<3:
            short_words.append(j)

for i in test_short_words:
    for j in i:
        if len(j)<3:
            short_words.append(j)

<span style="color:blue">In this part it collects short words, defined as those with a length less than three characters, from both the training and testing datasets into a list named short_words. It utilizes the get_short_words function to extract short words from each dataset and iterates through the resulting tokens, appending qualifying words to the short_words list. This concise process ensures the aggregation of all short words from both datasets for further analysis or usage.</span>

In [40]:
short_words

["'",
 'of',
 'is',
 'in',
 'in',
 'of',
 "'",
 "'",
 'i',
 'so',
 "'",
 "'",
 'of',
 'at',
 "'",
 'rt',
 'at',
 'at',
 "'",
 "'",
 'a',
 "'",
 "'",
 'dr',
 'to',
 "'",
 'be',
 'to',
 'if',
 "'",
 't',
 "'",
 'm',
 'm',
 "'",
 'of',
 "'",
 's',
 'a',
 'to',
 "'",
 'of',
 'a',
 "'",
 "'",
 'to',
 'a',
 "'",
 "'",
 't',
 'to',
 "'",
 'jo',
 'to',
 'he',
 'jo',
 'w',
 "'",
 'by',
 'pm',
 'at',
 'on',
 'n',
 "'",
 'on',
 'to',
 "'",
 'rt',
 'gb',
 "'",
 "'",
 'i',
 'am',
 "'",
 "'",
 'as',
 "'",
 "'",
 "'",
 'on',
 'a',
 "'",
 "'",
 'he',
 'in',
 'as',
 'h',
 't',
 "'",
 "'",
 'as',
 "'",
 "'",
 'ch',
 "'",
 "'",
 'be',
 'a',
 'he',
 "'",
 'll',
 'in',
 'to',
 'in',
 'dc',
 'po',
 "'",
 'is',
 "'",
 "'",
 'n',
 'to',
 's',
 "'",
 "'",
 'is',
 "'",
 "'",
 'to',
 's',
 'it',
 "'",
 's',
 'a',
 'in',
 'as',
 'to',
 'a',
 "'",
 'n',
 "'",
 'of',
 'to',
 'it',
 'is',
 "'",
 "'",
 'ex',
 'be',
 'to',
 "'",
 "'",
 'no',
 'a',
 'a',
 'as',
 'as',
 'an',
 'in',
 "'",
 "'",
 "'",
 'in',
 'by',
 'pm'

In [42]:
!pip install nltk



In [45]:
import nltk
# stop words update
my_stop_words = set()
ponct = ['?', '!',',','.',';','-','_']
stop_words = nltk.corpus.stopwords.words('english')

my_stop_words.update(ponct)
my_stop_words.update(stop_words)
my_stop_words.update(short_words)

<span style="color:blue">This code snippet initializes a set named my_stop_words to store custom stop words. It includes punctuation marks, common English stop words, and previously collected short words. This set is useful for text processing tasks, where removing irrelevant words can enhance model accuracy and efficiency.</span>

In [44]:
my_stop_words

{'!',
 "'",
 ',',
 '-',
 '.',
 ';',
 '?',
 '_',
 'a',
 'ab',
 'about',
 'above',
 'ac',
 'ad',
 'af',
 'after',
 'ag',
 'again',
 'against',
 'ah',
 'ai',
 'ain',
 'aj',
 'ak',
 'al',
 'all',
 'am',
 'an',
 'and',
 'any',
 'ap',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'au',
 'aw',
 'az',
 'b',
 'ba',
 'bb',
 'bd',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'bi',
 'bj',
 'bk',
 'bo',
 'both',
 'bp',
 'br',
 'bt',
 'bu',
 'but',
 'bw',
 'by',
 'c',
 'ca',
 'can',
 'cc',
 'cd',
 'cg',
 'ch',
 'ci',
 'cm',
 'co',
 'couldn',
 "couldn't",
 'cp',
 'cs',
 'ct',
 'd',
 'da',
 'db',
 'dc',
 'dd',
 'de',
 'df',
 'dg',
 'di',
 'did',
 'didn',
 "didn't",
 'dj',
 'dm',
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'dr',
 'du',
 'during',
 'dw',
 'e',
 'each',
 'ec',
 'ed',
 'eg',
 'eh',
 'el',
 'em',
 'en',
 'ep',
 'er',
 'es',
 'et',
 'eu',
 'ex',
 'f',
 'fa',
 'fb',
 'fc',
 'few',
 'ff',
 'fg',
 'fi',
 'fl',
 'fm',
 'fo',
 'for',
 'fr',

In [46]:
def remove_short_words(df):
    tokenizer = nltk.RegexpTokenizer(r'''\w'|\w+|[^\w\s]''')

    
    for i in range(len(df)):
        new_words = []
        try:
            tokens = tokenizer.tokenize(df['cleaned_tweets'][i])
            for token in tokens:
                if token not in my_stop_words:
                    new_words.append(token)
            df['cleaned_tweets'][i] = " ".join(new_words)
        except:
            continue 
            
    return df

<span style="color:blue">This function removes short and irrelevant words from the 'cleaned_tweets' column of the input DataFrame. It tokenizes each tweet, filters out words present in a set of custom stop words (my_stop_words), and reconstructs the tweets with only meaningful words. This filtering process enhances the quality of the tweet data for further analysis.</span>

In [47]:
train_data = remove_short_words(train_data)
test_data  = remove_short_words(test_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".join(new_words)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_tweets'][i] = " ".

In [48]:
train_data

Unnamed: 0,TweetId,Label,TweetText,cleaned_tweets
0,304271250237304833,Politics,'#SecKerry: The value of the @StateDept and @U...,seckerry value statedept usaid measured dollar...
1,304834304222064640,Politics,'@rraina1481 I fear so',fear
2,303568995880144898,Sports,'Watch video highlights of the #wwc13 final be...,watch video highlights final australia west in...
3,304366580664528896,Sports,'RT @chelscanlan: At Nitro Circus at #AlbertPa...,chelscanlan nitro circus albertpark theymakeit...
4,296770931098009601,Sports,'@cricketfox Always a good thing. Thanks for t...,cricketfox always good thing thanks feedback
...,...,...,...,...
6520,296675082267410433,Politics,'Photo: PM has laid a wreath at Martyrs Monume...,photo laid wreath martyrs monument algiers alg...
6521,306677536195231746,Sports,'The secret of the Chennai pitch - crumbling o...,secret chennai pitch crumbling edges solid mid...
6522,306451295307431937,Sports,@alinabhutto he isn't on Twitter either,alinabhutto twitter either
6523,306088574221176832,Sports,'Which England player would you take out to di...,england player would take dinner featuring amp


### Save Cleaned Data 

In [49]:
train_data.to_csv("train_cleaned.csv")
test_data.to_csv("test_cleaned.csv")

# 3) Feature Extraction

In [54]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)  # Choisissez un nombre approprié pour max_features
X = vectorizer.fit_transform(train_data['cleaned_tweets'])


<span style="color:blue">This code snippet uses the TfidfVectorizer from sklearn to transform the cleaned tweet data into a TF-IDF matrix 'X', with a maximum of 1000 features. The 'cleaned_tweets' column from the training dataset is used for this transformation.</span>

# 4) Models Training


In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Liste des modèles à tester
models = [
    LogisticRegression(),
    MultinomialNB(),
    RandomForestClassifier()
]

<span style="color:blue">This creates a list named 'models' containing instances of these classifiers for testing. These models will be evaluated to determine their performance in classifying the tweet data.</span>

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Séparation des données en ensemble d'entraînement et ensemble de validation
X_train, X_val, y_train, y_val = train_test_split(X, train_data['Label'], test_size=0.2, random_state=42)

# Boucle pour entrainer et évaluer chaque modèle
for model in models:
    # Entraînement du modèle
    model.fit(X_train, y_train)
    
    # Faire des prédictions sur l'ensemble de validation
    val_predictions = model.predict(X_val)
    
    # Évaluer la précision du modèle
    val_accuracy = accuracy_score(y_val, val_predictions)
    
    # Afficher la précision du modèle
    print(f"Modèle: {model.__class__.__name__}, Précision: {val_accuracy}")


Modèle: LogisticRegression, Précision: 0.9164750957854406
Modèle: MultinomialNB, Précision: 0.9218390804597701
Modèle: RandomForestClassifier, Précision: 0.9042145593869731


<span style="color:blue">We notice that from the three models, we can choose the one that has higher precision which is MultinomialNB.</span>




# 5) HyperParams improvement

In [72]:
from sklearn.model_selection import GridSearchCV

# Exemple avec un modèle Naive Bayes
parameters = {
    'alpha': [0.5, 1.0, 1.5, 2.0]
}

Best_nb_model = MultinomialNB()
clf = GridSearchCV(Best_nb_model, parameters)
clf.fit(X_train, y_train)


<span style="color:blue">We chose the parameter alpha to improve the model</span>

In [73]:
from sklearn.metrics import classification_report, accuracy_score

# Prédire les labels pour l'ensemble de validation en utilisant le meilleur estimateur trouvé par GridSearchCV
val_predictions = clf.best_estimator_.predict(X_val)

# Calculer les métriques de performance
val_accuracy = accuracy_score(y_val, val_predictions)
print(f'Précision sur l\'ensemble de validation: {val_accuracy}')

# Pour un rapport plus détaillé, y compris le rappel, la précision et le score F1
print(classification_report(y_val, val_predictions))


Précision sur l'ensemble de validation: 0.9233716475095786
              precision    recall  f1-score   support

    Politics       0.94      0.90      0.92       638
      Sports       0.91      0.94      0.93       667

    accuracy                           0.92      1305
   macro avg       0.92      0.92      0.92      1305
weighted avg       0.92      0.92      0.92      1305



<span style="color:blue">The classification report and accuracy score reveal that the best estimator achieved an accuracy of approximately 92.34% on the validation set, with precision, recall, and F1-score metrics demonstrating strong performance across both Politics and Sports classes. Specifically, precision and recall values for both classes are above 0.90, indicating reliable classification performance across the dataset</span>

In [74]:
# Results of GridSearchCV
cv_results = pd.DataFrame(clf.cv_results_)
print(cv_results[['param_alpha', 'mean_test_score']])


  param_alpha  mean_test_score
0         0.5         0.918391
1         1.0         0.918391
2         1.5         0.917050
3         2.0         0.916092


<span style="color:blue">The DataFrame cv_results displays the mean test scores for different alpha values tested during the GridSearchCV process, ranging from 0.5 to 2.0 for the Multinomial Naive Bayes model. It reveals that the model achieved the highest mean test score of approximately 91.84% with alpha values of 0.5 and 1.0, indicating that lower alpha values led to better performance on the validation set.</span>

# Prediction


In [62]:
test_data.head()

Unnamed: 0,TweetId,TweetText,cleaned_tweets
0,306486520121012224,'28. The home side threaten again through Maso...,home side threaten mason bennett gets end long...
1,286353402605228032,'@mrbrown @aulia Thx for asking. See http://t....,mrbrown aulia thx asking see derives series ab...
2,289531046037438464,'@Sochi2014 construction along the shores of t...,construction along shores black sea
3,306451661403062273,'#SecKerry\u2019s remarks after meeting with F...,seckerry remarks meeting foreign minister west...
4,297941800658812928,'The #IPLauction has begun. Ricky Ponting is t...,iplauction begun ricky ponting first player ha...


In [76]:
X_test = vectorizer.transform(test_data['TweetText'])
predictions = clf.predict(X_test)

submission = pd.DataFrame({
    'TweetId': test_data['TweetId'],
    'Label': predictions
})

submission.to_csv('.csv', index=False)


In [79]:
My_sample_submission = pd.read_csv("ML_Tweets_Classification/data/Prediction.csv")

In [80]:
My_sample_submission

Unnamed: 0,TweetId,Label
0,306486520121012224,Sports
1,286353402605228032,Sports
2,289531046037438464,Politics
3,306451661403062273,Politics
4,297941800658812928,Sports
...,...,...
2605,282023761044189184,Sports
2606,303879735006601216,Sports
2607,297956846046703616,Sports
2608,304265049537658880,Politics


<span style="color:blue">The provided code predicts the labels for the test dataset using the trained classifier (clf) and the TF-IDF vectorizer (vectorizer). The predictions are then saved into a DataFrame named 'submission', containing the 'TweetId' and predicted 'Label' columns. After saving this DataFrame to a CSV file, it reads a sample submission file ('Prediction.csv') into a DataFrame named 'My_sample_submission'. The analysis reveals the predicted labels ('Sports' or 'Politics') corresponding to each TweetId in the sample submission file.</span>