### Description
Ce fichier est consacré à l’extraction des caractéristiques en vue de la préparation des données pour l’entraînement des algorithmes de classification.

### Importation

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta

### Chargement et lecture des données

In [2]:
polluters = pd.read_csv("../Datasets/content_polluters.txt", header=None, delimiter="\t", 
                        names=['UserID', 'CreatedAt', 'CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets', 
                               'LengthOfScreenName', 'LengthOfDescriptionInUserProfile'])
polluters.head()

Unnamed: 0,UserID,CreatedAt,CollectedAt,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile
0,6301,2006-09-18 01:07:50,2010-01-17 20:38:25,3269,3071,861,8,132
1,10836,2006-10-27 14:38:04,2010-06-18 03:35:34,1949,793,226,9,134
2,10997,2006-10-29 09:50:38,2010-04-24 01:12:40,1119,9644,38674,12,158
3,633293,2007-01-14 12:40:10,2010-01-24 11:59:38,2174,6029,12718,11,121
4,717883,2007-01-27 22:14:18,2010-02-06 06:25:58,7731,7029,873,6,70


In [3]:
polluters_followings = pd.read_csv("../Datasets/content_polluters_followings.txt", header=None, delimiter="\t", 
                        names=['UserID', 'SeriesOfNumberOfFollowings'])
polluters_followings.head()

Unnamed: 0,UserID,SeriesOfNumberOfFollowings
0,6301,"3269,3310,3339,3381,3351,3323,3305,3275,3245,3..."
1,10836,"1949,1963,1963,1963,1963,1963,1963,1962,1961,1..."
2,10997,"1119,1119,999,999,1050,1170,1071,799,799,799,8..."
3,633293,"2174,2651,2676,2674,2673,2673,2673,2672,2672,2..."
4,717883,"7731,7737,7737,7741,7741,7741,7740,7740,7749,7..."


In [4]:
polluters_tweets = pd.read_csv("../Datasets/content_polluters_tweets.txt", header=None, delimiter="\t", 
                        names=['UserID', 'TweetID', 'Tweet', 'CreatedAt'])
polluters_tweets.head()

Unnamed: 0,UserID,TweetID,Tweet,CreatedAt
0,6301,5599519501,MELBOURNE ENQUIRY: Seeking a variety of acts f...,2009-11-10 15:14:31
1,6301,5600313663,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:46:05
2,6301,5600328557,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:46:40
3,6301,5600338093,THE BURLESQUE BOOTCAMP SYDNEY - Open Date tick...,2009-11-10 15:47:03
4,6301,5600564863,"Come to ""The Burlesque Bootcamp - Sydney"" Satu...",2009-11-10 15:56:03


In [5]:
legitimate_users = pd.read_csv("../Datasets/legitimate_users.txt", header=None, delimiter="\t", 
                        names=['UserID', 'CreatedAt', 'CollectedAt', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets', 
                               'LengthOfScreenName', 'LengthOfDescriptionInUserProfile'])
legitimate_users.head()

Unnamed: 0,UserID,CreatedAt,CollectedAt,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile
0,614,2006-07-13 15:30:05,2009-11-20 23:56:21,510,350,3265,10,34
1,1038,2006-07-15 16:12:15,2009-11-16 05:12:11,304,443,4405,7,156
2,1437,2006-07-16 12:29:24,2009-11-16 16:25:12,45,73,725,6,37
3,2615,2006-07-19 23:23:55,2009-11-27 18:34:36,211,230,211,7,0
4,3148,2006-07-26 14:17:22,2009-11-20 17:35:18,7346,7244,11438,8,97


In [6]:
legitimate_users_followings = pd.read_csv("../Datasets/legitimate_users_followings.txt", header=None, delimiter="\t", 
                        names=['UserID', 'SeriesOfNumberOfFollowings'])
legitimate_users_followings.head()

Unnamed: 0,UserID,SeriesOfNumberOfFollowings
0,614,"664,664,664,665,665,665,665,665,665,665,665,66..."
1,1038,"378,378,378,378,378,378,378,378,378,378,378,37..."
2,1437,"59,59,59,59,59,59,59,59,59,59,59,59,59,59,59,5..."
3,2615,"262,262,262,262,262,262,262,262,262,262,264,26..."
4,3148,"5238,5238,5238,5238,5238,5238,5238,5238,5238,5..."


In [7]:
legitimate_users_tweets = pd.read_csv("../Datasets/legitimate_users_tweets.txt", header=None, delimiter="\t", 
                        names=['UserID', 'TweetID', 'Tweet', 'CreatedAt'])
legitimate_users_tweets.head()

Unnamed: 0,UserID,TweetID,Tweet,CreatedAt
0,614,5912305459,… at house party in Daybreak. Not as weird as ...,2009-11-20 23:52:52
1,614,5908467165,Taxiing — at SLC Salt Lake City International...,2009-11-20 20:42:48
2,614,5904901963,Almost home! — at PDX Portland International ...,2009-11-20 18:11:01
3,614,5900351610,Lunch! — at Ten01 http://gowal.la/s/Awa,2009-11-20 15:04:42
4,614,5900312627,Mm … books — at @Powells http://gowal.la/s/6fe,2009-11-20 15:03:06


### Pipeline de traitement des données

In [8]:

#Calcul la durée de vie en mois d'un compte
def calculate_lifetime_account(df_user) :
    df_user['CreatedAt'] = pd.to_datetime(df_user['CreatedAt'])
    df_user['CollectedAt'] = pd.to_datetime(df_user['CollectedAt'])
    df_user['LifetimeAccountInMonth'] = df_user.apply(
        lambda row: (relativedelta(row['CollectedAt'], row['CreatedAt']).years * 12) + relativedelta(row['CollectedAt'], row['CreatedAt']).months, axis=1)
    return df_user
    
#Calcul le rapport entre deux nombres
def calculate_ratio(nb1, nb2):
    return round(nb1/nb2, 2) if nb2 != 0 else 0

#Calcul le rapport following/follower
def calculate_ratio_following_wer(df_user):
    df_user['FollowingFollowerRatio'] = df_user.apply(
        lambda row: calculate_ratio(row['NumberOfFollowings'], row['NumberOfFollowers']), axis=1)
    return df_user

#Calcul le nombre moyen de tweets par jour
def calculate_mean_tweets_per_day(df_tweet):
    #Conversion de la colonne en format datetime et recuperation de la date
    df_tweet['CreatedAt'] = pd.to_datetime(df_tweet['CreatedAt'])
    df_tweet['CreatedAtDate'] = df_tweet['CreatedAt'].dt.date
    
    nbTweetsPerUser = df_tweet.groupby('UserID')['TweetID'].count() #nombre de tweets crees par utilisateur
    nbDayOfTweetsPerUser = df_tweet.groupby('UserID')['CreatedAtDate'].nunique() #nombre de jour unique ou des tweets ont ete crees
    #Calcul du nombre moyen de tweets par jour par utilisateur et stcokage sous forme de dataframe
    userMeanTweets = round(nbTweetsPerUser / nbDayOfTweetsPerUser, 0).reset_index(name='MeanTweetsPerDay')
    
    #fusion du dataframe des nombres moyens de tweets avec le dataframe principal
    df_tweet = df_tweet.merge(userMeanTweets, on='UserID', how='left')
    df_tweet = df_tweet.drop(columns=['CreatedAtDate'])
    return df_tweet

    
#Calcul la proportion d’URL dans les tweets
def calculate_url_in_tweets_rate(df_tweet):
    tweetWithURL = df_tweet[df_tweet['Tweet'].str.contains('http', na=False)] #Recuperation et stockage des tweets contenant 'http'
    nbTweetWithURL = tweetWithURL.groupby('UserID')['TweetID'].count() #Nombre de tweet contenant 'http' par utilisateur
    nbTotalTweets = df_tweet.groupby('UserID')['TweetID'].count() #Nombre total de tweets par utilisateur
    
    #division du nombre de tweets avec 'http' par le nombre de tweets total et stockage sous forme de dataframe 
    userUrlTweetsRate = round(nbTweetWithURL / nbTotalTweets,2).reset_index(name='UrlInTweetsRate').fillna(0)
    df_tweet = df_tweet.merge(userUrlTweetsRate, on='UserID', how='left') #fusion du dataframe resultant avec le dataframe principal
    return df_tweet


#Calcul la proportion de mentions @ dans les tweets
def calculate_mentions_rate(df_tweet):
    #recuperation de tous les mots d'un tweet sous forme de liste, commencant par @ et n'etant pas suivi d'un '.'
    #pour eviter les adresses email
    df_tweet['Mentions'] = df_tweet['Tweet'].str.findall(r'(?:^|\s)(@[\w_]+)(?!\.[a-z]{2,})')
    #recuperation des tweets ayant des mentions
    tweetWithMentions = df_tweet[df_tweet['Mentions'].apply(lambda x: isinstance(x, list) and len(x) != 0)]
    nbTweetWithMentions = tweetWithMentions.groupby('UserID')['TweetID'].count() #nombre de tweets avec mentions par utilisateur
    nbTotalTweets = df_tweet.groupby('UserID')['TweetID'].count() #nombre total de tweets par utilisateur
    
    #Division du nombre de tweet avec mentions par le nombre de tweets total et stockage sous forme de dataframe
    userMentionsRate = round(nbTweetWithMentions / nbTotalTweets, 2).fillna(0).reset_index(name='UserMentionsRate')
    df_tweet = df_tweet.merge(userMentionsRate, on='UserID', how='left') #fusion du dataframe resultant avec le dataframe principal
    df_tweet = df_tweet.drop(columns=['Mentions'])
    return df_tweet


#Calcul le temps moyen et maximal entre deux tweets consécutifs
def calculate_mean_and_max_time_between_tweets(df_tweet):
    df_tweet['CreatedAt'] = pd.to_datetime(df_tweet['CreatedAt'])
    df_tweet = df_tweet.sort_values(by=['UserID','CreatedAt']) #trie des tweets par utilisateur et par ordre chronologique
    #Calcul en secondes de l'ecart de temps entre deux tweets consecutifs par utilisateur
    df_tweet['TimeSecsBetweenTweets'] = df_tweet.groupby('UserID')['CreatedAt'].diff().dt.total_seconds().fillna(0)

    #calcul de la moyenne en secondes des ecarts de temps les tweets par utilisateur et stcokage sous forme de dataframe
    userMeanTimeBetweenTweets = df_tweet.groupby('UserID')['TimeSecsBetweenTweets'].mean().round(0).reset_index(
        name='MeanTimeSecBetweenTweets')
     #calcul en secondes des ecart de temps maximum entre les tweets par utilisateur et stcokage sous forme de dataframe
    userMaxTimeBetweenTweets = df_tweet.groupby('UserID')['TimeSecsBetweenTweets'].max().round(0).reset_index(
        name='MaxTimeSecBetweenTweets')
    
    df_tweet = df_tweet.merge(userMeanTimeBetweenTweets, on='UserID', how='left') #fusion des ecarts de temps moyen avec le dataframe principal
    df_tweet = df_tweet.merge(userMaxTimeBetweenTweets, on='UserID', how='left')  #fusion des ecarts de temps max avec le dataframe principal
    df_tweet = df_tweet.drop(columns=['TimeSecsBetweenTweets'])
    return df_tweet


#Calcul le nombre moyen des mentions @ par tweet(caracteristiques supplementaires)
def calculate_mean_nb_mentions_per_tweets(df_tweet):
    #recuperation de tous les mots d'un tweet sous forme de liste, commencant par @ et n'etant pas suivi d'un '.'
    #pour eviter les adresses email
    df_tweet['Mentions'] = df_tweet['Tweet'].str.findall(r'(?:^|\s)(@[\w_]+)(?!\.[a-z]{2,})')
    #recuperation des tweets ayant des mentions
    tweetWithMentions = df_tweet[df_tweet['Mentions'].apply(lambda x: isinstance(x, list) and len(x) != 0)]
    nbTweetWithMentions = tweetWithMentions.groupby('UserID')['TweetID'].count() #nombre de tweets avec mentions par utilisateur
    nbMentions = df_tweet.explode('Mentions').groupby('UserID')['Mentions'].count() #nombre total de mentions par utilisateur
    
    #Division du nombre de total de mentions par le nombre de tweets avec mentions
    meanNbMentionsPerTweet = round(nbMentions / nbTweetWithMentions, 0).fillna(0).reset_index(name='MeanNbMentionsPerTweet')
    df_tweet = df_tweet.merge(meanNbMentionsPerTweet, on='UserID', how='left') #fusion du dataframe obtenu avec le dataframe principal
    df_tweet = df_tweet.drop(columns=['Mentions'])
    return df_tweet


#Calcul la similarite de jaccard entre deux tweets
def jaccard_similarity(tweet1, tweet2):
    #Separation des tweets en liste de mots uniques    
    wordsTweet1 = set(tweet1.lower().split()) 
    wordsTweet2 = set(tweet2.lower().split())
    
    intersection = len(wordsTweet1 & wordsTweet2) #nombre de mots communs aux deux tweets
    union = len(wordsTweet1 | wordsTweet2) #nombre de mots uniques a chacun des tweets
    
    return round(intersection / union,2) if union != 0 else 0


#Calcul la valeur moyenne des similarites de jaccard entre tweets consecutifs de chaque utiliseur
def calculate_mean_jaccard_similarity(df_tweet):
    df_tweet['CreatedAt'] = pd.to_datetime(df_tweet['CreatedAt'])
    df_tweet = df_tweet.sort_values(by=['UserID','CreatedAt']) #Trier les tweets par utilisateur et par date
    df_tweet['JaccardSimilarity'] = np.nan #Initialiser le colonne des similarites entre les tweets

    #Parcours de chaque utilisateur
    for user_id, group in df_tweet.groupby('UserID'):
        similarities = [np.nan] #Initialisation de la similarite du premier tweet
        # Extraction des tweets de l'utilisateur sous forme de liste en s'assurant de la validite des chaines de caracteres
        tweets = group['Tweet'].astype(str).fillna("").tolist() 
        
        #Parcours de chaque tweet de l'utilisateur en commencant par le deuxieme
        for i in range(1, len(tweets)):
            sim = jaccard_similarity(tweets[i-1], tweets[i]) #Comparaison de la similarite entre les tweets courant et precedent
            similarities.append(sim) #Ajout du resultat dans la liste des similarites
        df_tweet.loc[group.index, 'JaccardSimilarity'] = similarities #Assignation de l'indice de similarite a l'index correspondant

    #Calcul la valeur moyenne de similarite en stockage sous forme de dataframe
    meanJaccardSimilarity = df_tweet.groupby('UserID')['JaccardSimilarity'].mean().round(2).fillna(0).reset_index(name='MeanJaccardSimilarity')
    df_tweet = df_tweet.merge(meanJaccardSimilarity, on='UserID', how='left') #fusion du dataframe obtenu avec le dataframe principal
    df_tweet = df_tweet.drop(columns=['JaccardSimilarity']) 
    return df_tweet

#Calcul les caractéristiques en lien le compte utilisateur
def calculate_user_features(df_user):
    df_user = calculate_lifetime_account(df_user)
    df_user = calculate_ratio_following_wer(df_user)
    return df_user

#Calcul les caractéristiques en lien avec les tweets
def calculate_tweet_features(df_tweet):
    df_tweet = calculate_mean_tweets_per_day(df_tweet)
    df_tweet = calculate_url_in_tweets_rate(df_tweet)
    df_tweet = calculate_mentions_rate(df_tweet)
    df_tweet = calculate_mean_and_max_time_between_tweets(df_tweet)
    df_tweet = calculate_mean_nb_mentions_per_tweets(df_tweet)
    df_tweet = calculate_mean_jaccard_similarity(df_tweet)
    return df_tweet

#Calcul et fusionne les caracteristiques
def calculate_and_merge_features(df_user, df_tweet, df_followings, typeOfUser):
    df_user = calculate_user_features(df_user)
    df_tweet = calculate_tweet_features(df_tweet)
    #Renommage des colonnes pour eviter les confusions
    df_user = df_user.rename(columns={'CreatedAt':'CreatedAt_Acc'})
    df_tweet = df_tweet.rename(columns={'CreatedAt':'CreatedAt_Tweet'})
    #Fusion des dataframes
    merge_df = df_user.merge(df_followings, on='UserID', how='left')
    merge_df = df_tweet.merge(merge_df, on='UserID', how='left')
    #Ajout de la colonne indiquant la classe d'utilisateur
    merge_df['TypeOfUser'] = typeOfUser
    return merge_df

### Préparation des données

In [9]:
# Calcul et fusion des caracteristiques
polluters = calculate_and_merge_features(polluters, polluters_tweets, polluters_followings, 1)
legitimate = calculate_and_merge_features(legitimate_users, legitimate_users_tweets, 
                                          legitimate_users_followings, 0)

#Fusion des differents types d'utilisateurs
user_account = pd.concat([polluters, legitimate], axis=0, ignore_index=True)

#Suppression de toutes les lignes comprennant des doublons au niveau de UserID et TweetID car classé à la fois utilisateur légitime et pollueur
user_account = user_account.drop_duplicates(subset=["UserID", "TweetID"], keep=False)

#Suppression des doublons au niveau de UserID
user_account = user_account.drop_duplicates(subset=["UserID"])

#Selection des colonnes pertinentes pour l'apprentissage
colsToConsider = ['UserID', 'LengthOfScreenName', 'LengthOfDescriptionInUserProfile',
                'LifetimeAccountInMonth', 'NumberOfFollowings', 'NumberOfFollowers', 'NumberOfTweets',
                'FollowingFollowerRatio', 'MeanTweetsPerDay', 'UrlInTweetsRate', 'UserMentionsRate', 
                'MeanTimeSecBetweenTweets', 'MaxTimeSecBetweenTweets', 'MeanNbMentionsPerTweet',
                'MeanJaccardSimilarity', 'TypeOfUser']
user_account = user_account[colsToConsider]

### Exportation et affichage du rendu final

In [12]:
#Exportation des données sous format csv
user_account.to_csv("../Datasets/user_account.csv", index=False)

In [13]:
user_account

Unnamed: 0,UserID,LengthOfScreenName,LengthOfDescriptionInUserProfile,LifetimeAccountInMonth,NumberOfFollowings,NumberOfFollowers,NumberOfTweets,FollowingFollowerRatio,MeanTweetsPerDay,UrlInTweetsRate,UserMentionsRate,MeanTimeSecBetweenTweets,MaxTimeSecBetweenTweets,MeanNbMentionsPerTweet,MeanJaccardSimilarity,TypeOfUser
0,6301,8,132,39,3269,3071,861,1.06,4.0,0.63,0.24,29386.0,264985.0,1.0,0.07,1
200,10836,9,134,43,1949,793,226,2.46,2.0,0.60,0.09,262448.0,7649040.0,1.0,0.11,1
398,10997,12,158,41,1119,9644,38674,0.12,49.0,0.26,0.51,1697.0,43858.0,1.0,0.05,1
595,633293,11,121,36,2174,6029,12718,0.36,11.0,0.32,0.36,7514.0,74830.0,1.0,0.03,1
795,717883,6,70,36,7731,7029,873,1.10,10.0,0.68,0.08,13066.0,354362.0,1.0,0.09,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5580059,93390990,11,0,0,5,0,5,0.00,5.0,0.80,0.00,760.0,3351.0,0.0,0.14,0
5580064,93402679,12,0,0,20,1,1,20.00,1.0,0.00,0.00,0.0,0.0,0.0,0.00,0
5580065,93419256,8,0,0,0,0,1,0.00,1.0,0.00,0.00,0.0,0.0,0.0,0.00,0
5580066,93426370,10,0,0,20,1,1,20.00,1.0,0.00,0.00,0.0,0.0,0.0,0.00,0
