# 0. Initialize

## 0.1. Import Libraries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

## 0.2. DEFINE VARIABLES 

In [2]:
from google.colab import drive
drive.mount("./drive")

DATA_PATH = './drive/My Drive/' # '<insert-your-training-data-path-here>'

ROUND = 3 # This project will have 3 rounds of predictions: 1,2,3
STUDENT_ID = '28146'#'<insert-your-id-here>'
PROJECT_CODE = 'CS4124376a150a4b3'#'<insert-your-code-here>' # Same code for the annotation eg. CS412xxxxx

Mounted at ./drive


## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [3]:
#trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH))
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})
trainingTweetDf

Unnamed: 0,tweet_id,isPolitical
0,1597170281545551872,Yes
1,1431700027471192069,No
2,1566035577090281472,Yes
3,1591538690869940225,Yes
4,1583898169238167554,Yes
...,...,...
2995,1593539327623151619,Yes
2996,1393886554062524418,No
2997,1597925615092764672,Yes
2998,1585291418616176640,Yes


In [4]:
trainingTweetDf.isPolitical.value_counts()

Yes    2003
No      997
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [5]:
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH))
#trainingUserDf = pd.read_csv('training-user.csv')
trainingUserDf

Unnamed: 0,screen_name,isBot
0,koftecancaddy,No
1,ahaber,No
2,selahat03949652,No
3,erdin06357062,No
4,bhct__necatii,No
...,...,...
2995,djblumenberg,No
2996,mel1sq,No
2997,eren_yz1,Yes
2998,ergnyildiz4,No


In [6]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

### 0.3.3. Expand your dataset with metadata and tweets

In [7]:
# You can also expand training data by downloading your own labeled datasets following the link
# Download the documents under "Link to training data"

print('http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{}.html'.format(PROJECT_CODE))

http://www.onurvarol.com/Annotation-CS412-202201/reports/report_CS4124376a150a4b3.html


# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we firstly collect raw data for processing. We then combine some of them (total_interactions = num_favorites + num_retweets) or use them to extract features (whether the tweet has one of the political entities @meralaksener, @kilicdarogluk etc.).

We expect you to collect more raw data from **tweet_metadata**, **user_profiles** and **user_tweets** files by creating a function as shown in below examples such as *check_if_retweet()* and using it while iterating over data as shown under *Merge Collected Features*.

We also expect you to create new variables as much as you can from the data in order to make your predictions more accurate. For example, you may want to check:

- The tweet sources that a user frequently uses
- Whether the user is a verified account or not

...

to assess whether **a user is a bot or not** and whether **a tweet is political or not**.

In [8]:
PATH_TO_DOWNLOADED = './drive/My Drive/' # 'D:/Users/suuser/Desktop/Sabancı/CS412/spring-2022/project/'

## 1.1. Political Tweet Detection
This part stands for the feature extraction of tweets. We start with collecting the raw data from *tweet_metadata*, then use some of them to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [9]:
def check_if_retweet(tweet_metadata_line):
    is_retweet = 0
    retweeted_username = None

    try:
        tweet_metadata_line['retweeted_status']
        retweeted_username = tweet_metadata_line['retweeted_status']['user']['screen_name'].lower()
        is_retweet = 1

    except KeyError:
        pass

    return is_retweet, retweeted_username

def get_tweet_text(tweet_metadata_line):
    text = tweet_metadata_line['text']
    
    return text

def get_tweet_id(tweet_metadata_line):
    id_str = tweet_metadata_line['id_str']
    
    return id_str

def get_number_mentions_hashtags(tweet_metadata_line):
    num_mentions = len(tweet_metadata_line['entities']['user_mentions'])
    num_hashtags = len(tweet_metadata_line['entities']['hashtags'])

    return num_mentions, num_hashtags

def get_number_retweets_favorites(tweet_metadata_line):
    retweet_count = tweet_metadata_line['retweet_count']
    favorite_count = tweet_metadata_line['favorite_count']
    
    return retweet_count, favorite_count

def get_user_info(tweet_metadata_line):
    id = tweet_metadata_line['user']['id_str']
    screen_name = tweet_metadata_line['user']['screen_name'].lower()
    description = tweet_metadata_line['user']['description']
    is_verified = tweet_metadata_line['user']['verified']
    follower = tweet_metadata_line['user']['followers_count']
    following = tweet_metadata_line['user']['friends_count']
    tweet_num = tweet_metadata_line['user']['statuses_count']

    return id, screen_name, description, is_verified, follower, following, tweet_num

def check_political_ent(text):
    
    # the list below can be modified and some new names may be added (or removed)
    list_of_entities = ['akşener', 'aksener', 'gündem', 'kürt', 'adalet', 'hukuk', 'terör', 'işleri', 'bakan', 'pkk', 'ypg', 'kamu', 'amerika', 'parti', 'muhafaza', 'vergi',
                    'solcu', 'ülke', 'afgan', 'yunan', 'atatürk', 'erdoğan', 'erdogan', 'suriye', 'memleket', 'vatan', 'haber','akp', 'chp', 'mhp', 'hdp', 'zafer', 'kalkınma', 'iyi parti', 'türk', 'millet', 'devlet', 'kılıçdaroğlu', 'kilicdaroglu',
                    'muharrem ince', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'ozdag', 'özdağ', 'TBMM','drfahrettinkoca', 'yenisafak', 'tayyip', 'cumhur', 'belediye', 'baskan', 'başkan', 'ulusal',
                    'odatv', 'suleyman', 'haskologlu', 'mansur', 'dbdevletbahceli', 'Ahmet_Davutoglu', 'babacan', 'gazetesozcu', 'imamoglu', 'imamoğlu', 'parlament', 'meclis', 
                    'savaş', 'eğitim', 'egitim', 'dolar', 'lira', 'enflasyon', 'euro', 'döviz', 'altın', 'benzin', 'atama', 'altılı masa', 'abd', 'avrupa', 'almanya', 'nato',
                    'sınır', 'göçmen', 'gocmen', 'sığınmacı', 'mülteci', 'mahkeme', 'kanun', 'ukrayna', 'rusya', 'komisyon', 'fetö', 'faiz', 'piyasa', 'banka', 'politik', 'toplantı', 'çiftçi',
                    'saray', 'demokrasi', 'faşis', 'kemal', 'rejim', 'özgürlük', 'koalisyon', 'egemen', 'kurultay', 'danıştay', 'davutoğlu', 'birleşmiş milletler',
                    'bahçeli', 'diplomasi', 'cem uzan', 'lgbt', 'seçim', '2023', 'ibb', 'cemaat', 'soylu', 'liberal', 'kapital', 'protesto', 'halk']
    
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

def total_interactions(retweet_count, favorite_count):
    total_num_interactions = retweet_count + favorite_count
    
    return total_num_interactions



### 1.1.2. Collect data using the functions above and transform into a Pandas DataFrame

In [10]:
dfPolitical = {'tweet_id':[],
              'is_retweet':[],
              'retweeted_username':[],
              'text':[],
              'num_mentions':[],
              'num_hashtags':[],
              'num_retweets':[],
              'num_favorites':[],
              'user_id':[],
              'user_screen_name':[],
              'user_description':[],
              'user_verified':[],
              'user_follower':[],
              'user_following':[],
              'user_num_of_tweets':[],
              'num_political_entities':[],
              'total_interactions':[],
              }


with gzip.open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)
        
        # raw data:
        id_str = get_tweet_id(line)
        is_retweet, retweeted_username = check_if_retweet(line)
        text = get_tweet_text(line)
        num_mentions, num_hashtags = get_number_mentions_hashtags(line)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description, user_verified, user_follower, user_following, user_num_of_tweets = get_user_info(line)

        # manually crafted data:
        num_political_entities = check_political_ent(text)
        total_num_interactions = total_interactions(retweet_count, favorite_count)

        dfPolitical['tweet_id'].append(id_str)
        dfPolitical['is_retweet'].append(is_retweet)
        dfPolitical['retweeted_username'].append(retweeted_username)
        dfPolitical['text'].append(text)
        dfPolitical['num_mentions'].append(num_mentions)
        dfPolitical['num_hashtags'].append(num_hashtags)
        dfPolitical['num_retweets'].append(retweet_count)
        dfPolitical['num_favorites'].append(favorite_count)
        dfPolitical['user_id'].append(user_id_str)
        dfPolitical['user_screen_name'].append(screen_name)
        dfPolitical['user_description'].append(user_description)
        dfPolitical['user_verified'].append(user_verified)
        dfPolitical['user_follower'].append(user_follower)
        dfPolitical['user_following'].append(user_following)
        dfPolitical['user_num_of_tweets'].append(user_num_of_tweets)
        dfPolitical['num_political_entities'].append(num_political_entities)
        dfPolitical['total_interactions'].append(total_num_interactions)
        

In [11]:
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,user_verified,user_follower,user_following,user_num_of_tweets,num_political_entities,total_interactions
0,1588568792984346624,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,False,284,539,2638,1,147
1,1588452263047069697,0,,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,False,131,589,6647,1,0
2,1569589330544398336,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,False,60,118,2924,2,0
3,1570428119609139201,0,,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,False,2,69,1783,1,0
4,1551163840368414722,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,False,119,166,7559,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,ardanzenturk,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,False,171399,4147,41342,0,90
33529,1584027427696959488,0,,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,ozgul_61,Bridge design engineer Yaay hesabı : dilfiruz,False,3936,3227,52340,3,9
33530,1585945783307730945,0,,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,ladrekova,,False,121,412,1225,2,1
33531,1569748909521801221,1,muazzezeralp,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,yapikytgrivrlsn,,False,591,1003,153819,2,6


## 1.2. From Users

### 1.2.1. Get user metadata from user_profiles.jsons.gz

#### 1.2.1.1. Get user info metadata

In [12]:
def get_user_info_metadata(user_metadata_line):
    
    user_id = user_metadata_line['id_str']
    user_name = user_metadata_line['name']
    user_screen_name = user_metadata_line['screen_name'].lower()
    user_location = user_metadata_line['location']
    user_description = user_metadata_line['description']
    user_followers_count = user_metadata_line['followers_count']
    user_friends_count = user_metadata_line['friends_count']
    user_is_verified = user_metadata_line['verified']
    user_tweet_num = user_metadata_line['statuses_count']
    user_created_at = user_metadata_line['created_at']
    last4_created_at = int(user_created_at[-4:])
    if ((2022 - last4_created_at)*365 == 0):
      tweet_per_day = 0
    else:
      tweet_per_day = round(int(user_tweet_num) / ((2022 - last4_created_at)*365))
    user_favorites_count = user_metadata_line['favourites_count']
    user_is_translate = user_metadata_line['is_translation_enabled']
    user_extended_profile = user_metadata_line['has_extended_profile']
    
    dictionary = {'user_id':user_id, 'user_name': user_name, 'user_screen_name':user_screen_name, 'user_location':user_location,
     'user_description':user_description, 'user_followers_count':user_followers_count, 'user_friends_count':user_friends_count, 'user_is_verified':user_is_verified,
     'user_tweet_num':user_tweet_num, 'last4_created_at':last4_created_at, 'tweet_per_day':tweet_per_day, 'user_favorites_count':user_favorites_count, 'user_is_translate':user_is_translate, 
     'user_extended_profile':user_extended_profile}

    return dictionary

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [13]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    
    if user_friends_count + user_followers_count == 0:
        followers_all_ratio = 0

    else:
        followers_all_ratio =  user_followers_count / (user_friends_count + user_followers_count)

    return followers_all_ratio
def get_desc_len(user_description):
    
    description_len = len(user_description)

    return description_len

In [15]:
dfBot = {'user_id':[],
         'user_name':[],
         'user_screen_name':[],
         'user_location':[],
         'user_description':[],
         'user_followers_count':[],
         'user_friends_count':[],
         'description_len':[],
         'followers_to_all_ratio':[],
         'user_is_verified':[],
         'user_tweet_num':[],
         'last4_created_at':[],
         'tweet_per_day':[],
         'user_favorites_count':[],
         'user_is_translate':[],
         'user_extended_profile':[]}

with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)

        dictionary = get_user_info_metadata(line)
        for k,v in dictionary.items():
            dfBot[k].append(v)

        
        # manually crafted data:
        description_len = get_desc_len(dictionary['user_description'])
        dfBot['description_len'].append(description_len)
        
        followers_all_ratio = get_followers_all_ratio(dictionary['user_followers_count'], 
                                                      dictionary['user_friends_count'])

        dfBot['followers_to_all_ratio'].append(followers_all_ratio)

In [16]:
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_is_verified,user_tweet_num,last4_created_at,tweet_per_day,user_favorites_count,user_is_translate,user_extended_profile
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,False,2551,2021,7,17676,False,True
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,False,42771,2020,59,15474,False,True
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192308,False,14300,2019,13,18220,False,True
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325203,False,21303,2016,10,26999,False,False
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505051,False,1629,2013,0,2179,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513453,False,2396,2020,3,10820,False,False
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975088,False,75178,2010,17,36671,False,True
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451362,False,6482,2009,1,7389,False,False
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314431,False,121113,2014,41,140095,False,True


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

#### 1.2.2.1. Check ratio of retweets to all tweets

In [17]:
def get_retweet_tweet_ratio(line):
    number_retweets = 0
    number_original_tweets = 0

    for tweet in line['tweets']:
        try:
            tweet['retweeted_status']
            number_retweets += 1
                
        except:
            number_original_tweets += 1
            
    total_tweets = number_retweets + number_original_tweets
    
    if total_tweets == 0:
        retweet_total_ratio = None
    else:
        retweet_total_ratio = number_retweets/(total_tweets)
    
    return retweet_total_ratio, total_tweets

def get_median_number_favorites(line):
    num_median_favorites = np.median([tweet['favorite_count'] for tweet in line['tweets']])

    return num_median_favorites



### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [18]:
dfBotTweets = {'user_id':[],
               'retweet_total_ratio':[],
               'num_median_favorites':[],
               'num_of_tweets':[]}

i = 0

with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    for line in f:

        line = json.loads(line)

        user_id = line['user_id']
        dfBotTweets['user_id'].append(user_id)
        
        retweet_total_ratio, num_of_tweets = get_retweet_tweet_ratio(line)
        dfBotTweets['retweet_total_ratio'].append(retweet_total_ratio)
        dfBotTweets['num_of_tweets'].append(num_of_tweets)
        
        num_median_favorites = get_median_number_favorites(line)
        dfBotTweets['num_median_favorites'].append(num_median_favorites)

        i += 1
        
        if i % 10000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


10000
20000


In [19]:
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_tweets
0,594642154,0.115000,2.0,200
1,525600289,0.005025,1.0,199
2,931895965501534209,0.900000,0.0,200
3,1591543462746329088,0.185000,0.0,200
4,734801354749796352,1.000000,0.0,200
...,...,...,...,...
28310,1591370361488252928,0.800000,0.0,200
28311,1475272459616235525,0.825000,0.0,200
28312,1096753792731750401,0.051020,1.0,196
28313,1269527617687953409,0.095000,2.0,200


### 1.2.3. Merge dfBot and dfBotTweets

In [20]:
dfBotAll = dfBot.merge(dfBotTweets,
                       how='left')

dfBotAll[['retweet_total_ratio', 'num_median_favorites']] = dfBotAll[['retweet_total_ratio', 'num_median_favorites']].fillna(dfBotAll[['retweet_total_ratio', 'num_median_favorites']].median())

dfBotAll

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_is_verified,user_tweet_num,last4_created_at,tweet_per_day,user_favorites_count,user_is_translate,user_extended_profile,retweet_total_ratio,num_median_favorites,num_of_tweets
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,False,2551,2021,7,17676,False,True,0.395939,0.0,197.0
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,False,42771,2020,59,15474,False,True,0.125000,0.0,200.0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192308,False,14300,2019,13,18220,False,True,0.910000,0.0,200.0
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325203,False,21303,2016,10,26999,False,False,0.015306,1.0,196.0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505051,False,1629,2013,0,2179,False,False,0.659898,0.0,197.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513453,False,2396,2020,3,10820,False,False,0.015000,1.0,200.0
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975088,False,75178,2010,17,36671,False,True,0.291457,2.0,199.0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451362,False,6482,2009,1,7389,False,False,0.061538,0.0,195.0
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314431,False,121113,2014,41,140095,False,True,0.995000,0.0,200.0


# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

### 2.1.1. Merge dfPolitical data with labels

In [21]:
dfPoliticalAll_train = dfPolitical.merge(trainingTweetDf,
                                         on='tweet_id')
dfPoliticalAll_train['user_verified'] = dfPoliticalAll_train['user_verified']*1
dfPoliticalAll_train.head()

Unnamed: 0,tweet_id,is_retweet,retweeted_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,user_verified,user_follower,user_following,user_num_of_tweets,num_political_entities,total_interactions,isPolitical
0,1585955683513798656,0,,@AvOzlemZengin YüzüncüYıla YakışanGenelAf adli...,1,0,3,2,1564992353168941058,zehra78231638,,0,221,113,25730,0,5,Yes
1,1597631718479261696,0,,#TCYüzyılıÜcretliÖgrtKadro\n#TCYüzyılıÜcretliÖ...,0,2,30,28,1324630334416297985,nurozguler,,0,455,420,12427,0,58,Yes
2,1572522789948751874,0,,Ekrem İmamoğlu davayı değerlendirdi. 'Boş işle...,0,0,5,66,407597071,onediocom,Türkiye'nin ilk ve tek sosyal içerik sitesi ht...,1,735643,12,161123,0,71,Yes
3,1591412481561624577,0,,Sayın Bakanım @suleymansoylu POMEM önlisans er...,1,0,0,0,1394789887073738753,buckybarnestr,...,0,10,45,1595,3,0,Yes
4,1596914274907348992,0,,"@varank Sayın bakanım, Bodrumdaki bu araziyi ...",1,0,0,0,1586083256088371201,sayariahmet,,0,6,6,1251,1,0,Yes


### 2.1.2. Separate X and y values
We only use 3 features here to create a baseline model. However, it is not enough to get good results.

In [22]:
#X = dfPoliticalAll_train[['num_political_entities']]

X = dfPoliticalAll_train[['num_mentions','num_hashtags','num_retweets','num_political_entities','total_interactions']]
y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

### 2.1.3. Train - validation split

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.1.4. Train the model

Here, you may use different models such as neural networks, XGBoost, AdaBoost, RandomForest, Linear Regression, Logistic Regression etc. to see which model does the best. Also, you can use grid_search_cv() or a basic for loop to optimize the hyperparameters of your model.

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
dtc_political = DecisionTreeClassifier()
param_grid = {'criterion':['gini','entropy'], 'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(dtc_political,
                    param_grid, 
                    cv=5,
                    scoring='precision',
                    return_train_score=False, 
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
MSE: 0.24333333333333335 
 Accuracy Score: 0.7566666666666667 
 Confusion Matrix: 
 [[144  65]
 [ 81 310]]


In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
rfc_political = RandomForestClassifier()
param_grid = {'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(rfc_political,
                    param_grid, 
                    cv=5,
                    scoring='precision',
                    return_train_score=False, 
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
MSE: 0.23666666666666666 
 Accuracy Score: 0.7633333333333333 
 Confusion Matrix: 
 [[ 99 110]
 [ 32 359]]


In [26]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
rfc_political = GradientBoostingClassifier(n_estimators=100)
param_grid = {'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(rfc_political,
                    param_grid, 
                    cv=5,
                    scoring='precision',
                    return_train_score=False, 
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
MSE: 0.23166666666666666 
 Accuracy Score: 0.7683333333333333 
 Confusion Matrix: 
 [[100 109]
 [ 30 361]]


In [27]:
import keras
import keras.utils
from tensorflow.keras import utils as np_utils
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import SGD, Adam
import tensorflow as tf
from tensorflow.keras import regularizers


In [28]:
X_dfPoliticalAll_train = X
y_dfPoliticalAll_train = y

from sklearn.preprocessing import MinMaxScaler
msc = MinMaxScaler(feature_range=(0, 1))
scaledP_Xtrain = msc.fit_transform(X_dfPoliticalAll_train)
scaledP_Xtrain_df = pd.DataFrame(scaledP_Xtrain, columns=X_dfPoliticalAll_train.columns.values)
scaledP_ytrain_df = y_dfPoliticalAll_train.astype(float)

pX_train, pX_valid, py_train, py_valid = train_test_split(scaledP_Xtrain_df, scaledP_ytrain_df, test_size=0.20, random_state=42)

In [29]:
from sklearn.model_selection import KFold
num_folds = 5

# create an instance of the KFold class
kf = KFold(n_splits=num_folds)


model_1 = tf.keras.Sequential()
model_1.add(tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_1'))
model_1.add(tf.keras.layers.Dense(50, activation='relu', name='hidden_layer_2'))
model_1.add(tf.keras.layers.Dense(25, activation='relu', name='hidden_layer_3'))
model_1.add(tf.keras.layers.Dense(1, name='output_layer'))

# compile your model with an optimizer

model_1.compile(loss='mean_squared_error', optimizer = Adam(learning_rate=0.001), metrics=['mse','accuracy'])

pX = pX_train.values
pY = py_train.values

# initialize an array to store the training and validation scores
scores = []

# loop through the folds
for train_index, val_index in kf.split(pX, pY):
    # get the training and validation data for this fold
    pX_train, pX_val = pX[train_index], pX[val_index]
    pY_train, pY_val = pY[train_index], pY[val_index]
    
    # fit the model on the training data
    model_1.fit(pX_train, pY_train, epochs=35, batch_size=64, shuffle=True, verbose=1)
    
    # evaluate the model on the validation data
    val_loss, val_mse, val_acc = model_1.evaluate(pX_val, pY_val, verbose=0)
    
    # store the validation score for this fold
    scores.append(val_loss)

# print the average validation score
print("Average validation score:", sum(scores)/len(scores))

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 1

In [32]:
## last model_1 --> for political

from sklearn.model_selection import KFold
num_folds = 5

# create an instance of the KFold class
kf = KFold(n_splits=num_folds)


model_1 = tf.keras.Sequential()
model_1.add(tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_1',kernel_regularizer=regularizers.l2(0.001)))
model_1.add(Dropout(0.5))
model_1.add(tf.keras.layers.Dense(50, activation='relu', name='hidden_layer_2',kernel_regularizer=regularizers.l2(0.001)))
model_1.add(Dropout(0.5))
model_1.add(tf.keras.layers.Dense(25, activation='relu', name='hidden_layer_3',kernel_regularizer=regularizers.l2(0.001)))
model_1.add(Dropout(0.5))
model_1.add(tf.keras.layers.Dense(1, name='output_layer'))

# compile your model with an optimizer

model_1.compile(loss='mean_squared_error', optimizer = Adam(learning_rate=0.01), metrics=['mse','accuracy'])

pX = pX_train.values
pY = py_train.values

# initialize an array to store the training and validation scores
scores = []

# loop through the folds
for train_index, val_index in kf.split(pX, pY):
    # get the training and validation data for this fold
    pX_train, pX_val = pX[train_index], pX[val_index]
    pY_train, pY_val = pY[train_index], pY[val_index]
    
    # fit the model on the training data
    model_1.fit(pX_train, pY_train, epochs=35, batch_size=64, shuffle=True, verbose=1)
    
    # evaluate the model on the validation data
    val_loss, val_mse, val_acc = model_1.evaluate(pX_val, pY_val, verbose=0)
    
    # store the validation score for this fold
    scores.append(val_loss)

# print the average validation score
print("Average validation score:", sum(scores)/len(scores))

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 1

In [33]:
model_1.evaluate(pX_valid,py_valid)



[0.19307644665241241, 0.18465301394462585, 0.746666669845581]

## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [34]:
dfBotAll.user_screen_name = dfBotAll.user_screen_name.str.lower()

In [35]:
dfBotAll_train = dfBotAll.merge(trainingUserDf,
                               left_on='user_screen_name',
                               right_on='screen_name')

dfBotAll_train['user_is_verified'] = dfBotAll_train['user_is_verified']

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_is_verified,user_tweet_num,last4_created_at,tweet_per_day,user_favorites_count,user_is_translate,user_extended_profile,retweet_total_ratio,num_median_favorites,num_of_tweets,screen_name,isBot
0,1512081815292432394,sezgin,sezgin953116371,,,46,430,0,0.096639,False,1535,2022,0,9627,False,True,0.050251,0.0,199.0,sezgin953116371,No
1,1425452291428077571,Adem Koç,gogoadem61,,,14,171,0,0.075676,False,113,2021,0,74,False,True,0.761062,0.0,113.0,gogoadem61,No
2,328164303,Necmettin Balıkçı,dewil511,,,21,49,0,0.300000,False,219,2011,0,25,False,False,0.010101,0.0,198.0,dewil511,Yes
3,1343666971368431622,Night Bird⁷🦉,midnight__bird,,"La vie est un sommeil, l’amour en est le rêve...",422,260,48,0.618768,False,15191,2020,21,84933,False,True,0.085000,1.0,200.0,midnight__bird,No
4,1240932880488038400,Samed Pınarcı,samedpinarci,,Orman Mühendisi - Orman İşletme Şefi - Orman G...,133,202,60,0.397015,False,3734,2020,5,15783,False,True,0.780000,0.0,200.0,samedpinarci,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,0,0.712264,False,5288,2020,7,5347,False,True,0.000000,1.0,200.0,anka6054,No
2996,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,0,0.320370,False,311,2022,0,196,False,True,0.580000,0.0,200.0,atamabekleyenzz,No
2997,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,71,0.498215,False,1029,2016,0,725,False,True,0.040000,36.0,200.0,memrahinci,No
2998,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,26,0.090909,False,18,2022,0,38,False,True,0.769231,0.0,13.0,muratkkk18,No


In [36]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

### 2.2.2. Separate X and y values
We use only 4 features here to create a baseline model. However, it is not enough to get good results.

In [37]:
X = dfBotAll_train[['user_followers_count', 'user_friends_count', 'followers_to_all_ratio', 'user_is_verified', 'user_tweet_num', 'tweet_per_day', 'user_favorites_count', 'user_extended_profile', 'retweet_total_ratio', 'num_median_favorites']]
y = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)


### 2.2.3. Train-test split

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.2.4. Train the model

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
dtc_bot = DecisionTreeClassifier()

# fit your model
dtc_bot.fit(X_train, y_train)

# make predictions
preds = dtc_bot.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.23166666666666666 
 Accuracy Score: 0.75 
 Confusion Matrix: 
 [[401  92]
 [ 58  49]]


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
rfc_bot = RandomForestClassifier()
param_grid = {'criterion':['gini', 'entropy'], 'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(rfc_bot,
                    param_grid, 
                    cv=5,
                    scoring='precision',
                    return_train_score=False, 
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


MSE: 0.16833333333333333 
 Accuracy Score: 0.8316666666666667 
 Confusion Matrix: 
 [[489   4]
 [ 97  10]]


In [41]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
rfc_bot = GradientBoostingClassifier(n_estimators=100)
param_grid = {'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(rfc_bot,
                    param_grid, 
                    cv=5,
                    scoring='precision',
                    return_train_score=False,
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
MSE: 0.16666666666666666 
 Accuracy Score: 0.8333333333333334 
 Confusion Matrix: 
 [[466  27]
 [ 73  34]]


In [44]:
from sklearn.model_selection import KFold
num_folds = 5

# create an instance of the KFold class
kf = KFold(n_splits=num_folds)


model_2 = tf.keras.Sequential()
model_2.add(tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_1'))
model_2.add(tf.keras.layers.Dense(50, activation='relu', name='hidden_layer_2'))
model_2.add(tf.keras.layers.Dense(25, activation='relu', name='hidden_layer_3'))
model_2.add(tf.keras.layers.Dense(1, name='output_layer'))

# compile your model with an optimizer

model_2.compile(loss='mean_squared_error', optimizer = Adam(learning_rate=0.001), metrics=['mse','accuracy'])

pX = pX_train.values
pY = py_train.values

# initialize an array to store the training and validation scores
scores = []

# loop through the folds
for train_index, val_index in kf.split(pX, pY):
    # get the training and validation data for this fold
    pX_train, pX_val = pX[train_index], pX[val_index]
    pY_train, pY_val = pY[train_index], pY[val_index]
    
    # fit the model on the training data
    model_2.fit(pX_train, pY_train, epochs=35, batch_size=64, shuffle=True, verbose=1)
    
    # evaluate the model on the validation data
    val_loss, val_mse, val_acc = model_2.evaluate(pX_val, pY_val, verbose=0)
    
    # store the validation score for this fold
    scores.append(val_loss)

# print the average validation score
print("Average validation score:", sum(scores)/len(scores))

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 1

In [45]:
X_dfPoliticalAll_train = X
y_dfPoliticalAll_train = y

from sklearn.preprocessing import MinMaxScaler
msc = MinMaxScaler(feature_range=(0, 1))
scaledP_Xtrain = msc.fit_transform(X_dfPoliticalAll_train)
scaledP_Xtrain_df = pd.DataFrame(scaledP_Xtrain, columns=X_dfPoliticalAll_train.columns.values)
scaledP_ytrain_df = y_dfPoliticalAll_train.astype(float)

pX_train, pX_valid, py_train, py_valid = train_test_split(scaledP_Xtrain_df, scaledP_ytrain_df, test_size=0.20, random_state=42)

In [46]:
## last model_2 --> for bot

from sklearn.model_selection import KFold
num_folds = 5

# create an instance of the KFold class
kf = KFold(n_splits=num_folds)


model_2 = tf.keras.Sequential()
model_2.add(tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_1',kernel_regularizer=regularizers.l2(0.001)))
model_2.add(Dropout(0.5))
model_2.add(tf.keras.layers.Dense(50, activation='relu', name='hidden_layer_2',kernel_regularizer=regularizers.l2(0.001)))
model_2.add(Dropout(0.5))
model_2.add(tf.keras.layers.Dense(25, activation='relu', name='hidden_layer_3',kernel_regularizer=regularizers.l2(0.001)))
model_2.add(Dropout(0.5))
model_2.add(tf.keras.layers.Dense(1, name='output_layer'))

# compile your model with an optimizer

model_2.compile(loss='mean_squared_error', optimizer = Adam(learning_rate=0.01), metrics=['mse','accuracy'])

pX = pX_train.values
pY = py_train.values

# initialize an array to store the training and validation scores
scores = []

# loop through the folds
for train_index, val_index in kf.split(pX, pY):
    # get the training and validation data for this fold
    pX_train, pX_val = pX[train_index], pX[val_index]
    pY_train, pY_val = pY[train_index], pY[val_index]
    
    # fit the model on the training data
    model_2.fit(pX_train, pY_train, epochs=35, batch_size=64, shuffle=True, verbose=1)
    
    # evaluate the model on the validation data
    val_loss, val_mse, val_acc = model_2.evaluate(pX_val, pY_val, verbose=0)
    
    # store the validation score for this fold
    scores.append(val_loss)

# print the average validation score
print("Average validation score:", sum(scores)/len(scores))

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 1

In [47]:
model_2.evaluate(pX_valid,py_valid)



[0.14022254943847656, 0.13663634657859802, 0.8216666579246521]

# 3. MAKE PREDICTIONS

Here, you will make predictions with the models that you have trained above.

## 3.1. Predictions for Tweets (Political or Not)

In [48]:
# read the evaluation file as follows
evaluationTweetDf = pd.read_csv('{}evaluation-round3-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str}, header=None, names=['tweet_id'])
evaluationTweetDf = evaluationTweetDf.dropna()
evaluationTweetDf

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfPolitical_test = dfPolitical.merge(evaluationTweetDf)
evaluationTweetDf['tweet_id'] = pd.to_numeric(evaluationTweetDf['tweet_id'])

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfPolitical_test[['num_mentions','num_hashtags','num_retweets','num_political_entities','total_interactions']]

X_dfPolitical_test = X

msc = MinMaxScaler(feature_range=(0, 1))
scaledpt_Xtest = msc.fit_transform(X_dfPolitical_test)
scaledpt_Xtest_df = pd.DataFrame(scaledpt_Xtest, columns=X_dfPolitical_test.columns.values)

# make predictions based on these variables
predictions_political = model_1.predict(scaledpt_Xtest_df)



### This part is important! We expect you to return your predictions in the following format:

In [49]:
modelPredTweet = dict([(x,float(y)) for x,y in zip([*dfPolitical_test.tweet_id], predictions_political)])
#modelPredTweet = {x: 0.0 if y < 0.5001 else 1.0
#           for (x,y) in modelPredTweet.items()}

values = modelPredTweet.values()
min_ = min(values)
max_ = max(values)
modelPredTweet = {key: ((v-min_)/(max_-min_)) for (key,v) in modelPredTweet.items()}

modelPredTweet

{'1434787703783051264': 0.0029369887989404756,
 '1367571642604544000': 0.006277100795077144,
 '1589993032975544320': 1.0,
 '1565312596135354373': 1.0,
 '1579558096833511424': 1.0,
 '1439547067337256967': 0.0029366273345179726,
 '1559963768372740098': 0.002939428683792369,
 '1562853131251118081': 0.0013857642297699206,
 '1586021183958704128': 0.0029369887989404756,
 '1585766233491886081': 0.0029369887989404756,
 '1427746815420604417': 0.001383956907657407,
 '1352635736537882629': 0.002936356236201096,
 '1415032260571680768': 0.0029369887989404756,
 '1548636597628899328': 1.0,
 '1564926450096013313': 0.001384770202608038,
 '1585634359612420101': 1.0,
 '1597138789108895744': 0.32865899590966857,
 '1391681495622995971': 0.0013846798365024125,
 '1389951943343316995': 0.002936808066729224,
 '1452348722810138646': 0.0044891170291672874,
 '1595829502021623812': 0.48937240378178537,
 '1413108476348354562': 0.002936808066729224,
 '1579408398894137344': 0.48937240378178537,
 '1570758749606019073'

## 3.2. Predictions for Users (Bot or Not)

In [50]:
evaluationUserDf = pd.read_csv('{}evaluation-round3-user.csv'.format(DATA_PATH), dtype={0: str}, header=None, names=['user_screen_name'])
evaluationUserDf = evaluationUserDf.dropna()

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfBot_test = dfBotAll.merge(evaluationUserDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfBot_test[['user_followers_count', 'user_friends_count', 'followers_to_all_ratio', 'user_is_verified', 'user_tweet_num', 'tweet_per_day', 'user_favorites_count', 'user_extended_profile', 'retweet_total_ratio', 'num_median_favorites']]

X['user_is_verified'] = X['user_is_verified']*1
X['user_extended_profile'] = X['user_extended_profile']*1
X_dfBot_test = X

msc = MinMaxScaler(feature_range=(0, 1))
scaledt_Xtest = msc.fit_transform(X_dfBot_test)
scaledt_Xtest_df = pd.DataFrame(scaledt_Xtest, columns=X_dfBot_test.columns.values)

# make predictions based on these variables
predictions_bot = model_2.predict(scaledt_Xtest_df)
 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['user_is_verified'] = X['user_is_verified']*1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['user_extended_profile'] = X['user_extended_profile']*1




In [51]:
modelPredUser = dict([(x,float(y)) for x,y in zip([*dfBot_test.user_screen_name], predictions_bot)])
#modelPredUser = {x: 0.0 if y < 0.2500 else 1.0
#           for (x,y) in modelPredUser.items()}

values = modelPredUser.values()
min_ = min(values)
max_ = max(values)
modelPredUser = {key: ((v-min_)/(max_-min_)) for (key,v) in modelPredUser.items()}

modelPredUser

{'biologselim': 0.11521206145793458,
 'omerakdag34': 0.0,
 'bilgin21604923': 0.030873145745979134,
 '_sydneycarton_': 0.0,
 'denizlihabercom': 0.0,
 'burakerbaychp': 0.006789257192269879,
 'mvnez': 0.0,
 'qara118': 0.0,
 'nabiyonyevrum': 0.0,
 'farukhalit2': 0.0035964610364256685,
 'harlunoshi': 0.0,
 'heritagepaix': 0.0,
 'nuranwolf': 0.0,
 'politikgundem': 0.0,
 'isakethudax': 0.0,
 'enveraysevera': 0.0,
 'ilaydejaneiro': 0.00866383793994211,
 '1905anason': 0.24458091637319365,
 'eraydurgut03': 0.025804638372094842,
 'dasiskein': 0.0,
 'ercan_bas29': 0.01913555887585331,
 'mett_1907': 0.0,
 'ondemir066': 0.7199049222245083,
 'semihyeteer': 0.0007106310650822226,
 'haberinyokcokk': 0.011453838807795515,
 'meleky_ozaydin': 0.14738724818989954,
 'mehmetaltay64': 0.007438234142674752,
 'nurtencam2': 0.035034928925588786,
 'bilobi4': 0.009933095285260284,
 'yorumsuzadam87': 0.03265876717162749,
 'twitsildiren': 0.0,
 '21gramlife1': 0.007855812504706669,
 'cakan0_': 0.0,
 'oguzksalici': 0.

# PREPARE SUBMISSION

You will need to submit exact same file produced by using the following code. Any deviation from the desired format willbe marked as 0.

In [52]:
# Explain your approach

data_explanations = '''
\nWe trained our political data with 4 features and bot data with 10 features. We specifically not include text, description of tweet and user_id to training data.\nBasically the training data include numerical and categorical attributes, not verbal ones. 
'''

feature_explanations = '''\nWe added some additional feautures for a better ML model. user_is_verified returns with a boolean value. States whether the user is verified by twitter or not.\ntweet_per_day finds average tweet per day.\n
'''

model_explanations ='''
In the previous round we used CNN model however it gave us bad results due to overfitting. Then we decide to avoid overfitting by using k fold cross validation and dropout method in keras library. Now we expect that our model does not overfit. We have 3 layer and 0.2 test size. At the end for each prediction( political and bot) we get the average of each epoch (epoch = 30). Lastly we use Adam optimizer to get better results.
'''

additional_explanations = '''
Any other tricks that you tried for the project
'''


In [53]:
predictions = {
    'round': 3,
    'student_id': 28146,
    'user_predictions': modelPredUser,
    'tweet_predictions': modelPredTweet,
    'explanations': {
        'data': data_explanations,
        'feature': feature_explanations,
        'model': model_explanations,
        'other': additional_explanations,
    }
}


with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'w') as fl:
    fl.write(json.dumps(predictions, indent=4))

In [54]:
# Test your submission file
ROUND = 3
submission = json.load(open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'r'))
submission

{'round': 3,
 'student_id': 28146,
 'user_predictions': {'biologselim': 0.11521206145793458,
  'omerakdag34': 0.0,
  'bilgin21604923': 0.030873145745979134,
  '_sydneycarton_': 0.0,
  'denizlihabercom': 0.0,
  'burakerbaychp': 0.006789257192269879,
  'mvnez': 0.0,
  'qara118': 0.0,
  'nabiyonyevrum': 0.0,
  'farukhalit2': 0.0035964610364256685,
  'harlunoshi': 0.0,
  'heritagepaix': 0.0,
  'nuranwolf': 0.0,
  'politikgundem': 0.0,
  'isakethudax': 0.0,
  'enveraysevera': 0.0,
  'ilaydejaneiro': 0.00866383793994211,
  '1905anason': 0.24458091637319365,
  'eraydurgut03': 0.025804638372094842,
  'dasiskein': 0.0,
  'ercan_bas29': 0.01913555887585331,
  'mett_1907': 0.0,
  'ondemir066': 0.7199049222245083,
  'semihyeteer': 0.0007106310650822226,
  'haberinyokcokk': 0.011453838807795515,
  'meleky_ozaydin': 0.14738724818989954,
  'mehmetaltay64': 0.007438234142674752,
  'nurtencam2': 0.035034928925588786,
  'bilobi4': 0.009933095285260284,
  'yorumsuzadam87': 0.03265876717162749,
  'twitsil