## The purpose of this Jupyter Notebook is to test a prepared, trained model based on the most important features of a Twitter user profile:
*   followers count
*   tweet count
*   following count
*   account age (in days)
*   length of description

-----
-----

#### Install and import libs

In [1]:
# Run Colab's interactive user authentication for this session.
# NOTE(review): presumably required for the Google Drive downloads later in
# the notebook — confirm; the download cell only passes share links to gdown.
from google.colab import auth
auth.authenticate_user()

In [None]:
!pip install gdown

In [3]:
import gdown
import json
from datetime import datetime
import pandas as pd
from keras.models import load_model
import ipywidgets as widgets
from IPython.display import display

#### Compile prepared functions

##### Function counts the difference in days between two dates

In [4]:
def cal_days_diff(a, b):
    """Return the number of whole days from ``b`` to ``a``.

    Both arguments must support subtraction that yields an object with a
    ``days`` attribute (for example two ``date`` or two ``datetime`` values).
    The result is negative when ``a`` is earlier than ``b``.
    """
    elapsed = a - b
    return elapsed.days

##### Function standardizes dataframe columns based on the predefined training-set mean and std

In [5]:
def standardize_features(df, standardization_values):
  """Standardize the listed columns of ``df`` in place and return ``df``.

  Args:
    df: Dataframe whose columns are rescaled. It is modified in place.
    standardization_values: Mapping of column name to a sequence whose
      item 0 is the training mean and item 1 the training std.

  Returns:
    The same ``df`` object, with every listed column that exists in it
    replaced by ``(value - mean) / std``. Listed columns that are absent
    from ``df`` are ignored; unlisted columns are left untouched.
  """
  for column_name, stats in standardization_values.items():
    mean_training, std_training = stats[0], stats[1]
    if column_name not in df.columns:
      continue
    centered = df[column_name] - mean_training
    df[column_name] = centered / std_training

  return df

##### Function extracts, preprocesses user's account data and packs into dataframe

In [6]:
def extract_import_features_to_df(user_data):
  """Build the one-row model-input dataframe from a user's account data.

  Missing, empty or zero fields fall back to 0, so an empty ``user_data``
  yields a row of zeros.

  Args:
    user_data: Dict with the user's account data. Fields read:
      ``public_metrics`` (``followers_count``, ``tweet_count``,
      ``following_count``), ``created_at`` and ``description``.

  Returns:
    pandas.DataFrame with a single row and the columns ``followers_count``,
    ``tweet_count``, ``following_count``, ``account_age`` (days between
    today's local date and the account creation date) and ``descr_len``
    (number of characters in the description).

  Raises:
    ValueError: If ``created_at`` is present but matches none of the
      supported date formats.
  """
  # Tried in order. The first entry is the only format originally accepted;
  # the others cover ISO 8601 timestamps (with and without fractional
  # seconds) and the "Wed May 23 06:01:13 +0000 2007" layout.
  created_at_formats = (
    "%Y-%m-%d %H:%M:%S%z",
    "%Y-%m-%dT%H:%M:%S.%f%z",
    "%Y-%m-%dT%H:%M:%S%z",
    "%a %b %d %H:%M:%S %z %Y",
  )

  # `or` maps missing keys and falsy values (None, 0, "") to the default.
  public_metrics = user_data.get('public_metrics') or {}
  followers_count = public_metrics.get('followers_count') or 0
  tweet_count = public_metrics.get('tweet_count') or 0
  following_count = public_metrics.get('following_count') or 0

  account_age = 0
  created_at = user_data.get('created_at')
  if created_at:
    account_creation_date = None
    for date_format in created_at_formats:
      try:
        account_creation_date = datetime.strptime(created_at, date_format).date()
      except ValueError:
        continue
      break
    if account_creation_date is None:
      raise ValueError(
        "Unsupported 'created_at' value {!r}; expected one of the formats: {}".format(
          created_at, ", ".join(created_at_formats)))
    account_age = cal_days_diff(datetime.today().date(), account_creation_date)

  description = user_data.get('description')
  descr_len = len(description) if description else 0

  # Column order is kept exactly as before because the dataframe is passed
  # on unchanged as model input.
  df = pd.DataFrame({
    "followers_count" : [followers_count],
    "tweet_count" : [tweet_count],
    "following_count" : [following_count],
    "account_age" : [account_age],
    "descr_len" : [descr_len]
  })
  return df

##### Predefined mean and std used for standardization

In [7]:
# Training-set statistics for each important feature, stored per column
# name as a (mean, std) tuple.
standardization_values = {
    "followers_count": (6208.796717535272, 44145.842669630874),
    "tweet_count": (6595.194212496401, 33286.69197462025),
    "following_count": (1259.0381514540743, 6144.786564521689),
    "account_age": (2443.6521739130435, 1640.571178505393),
    "descr_len": (84.77094730780306, 59.6092442340293),
}

##### Load example from file

In [8]:
def load_example(json_file='bot_1.json', base_dir=None):
  """Load one user-data JSON file and return its parsed content.

  Args:
    json_file: Name of the JSON file, relative to ``base_dir``.
    base_dir: Directory that contains the file. When omitted, the
      notebook-level ``example_destination_dir`` is used (looked up at
      call time, so it only has to exist before the first call).

  Returns:
    The object decoded from the JSON file.

  Raises:
    OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    json.JSONDecodeError: If the file does not contain valid JSON.
  """
  # Imported locally so the function does not rely on a notebook-level import.
  import os

  if base_dir is None:
    base_dir = example_destination_dir
  user_data_path = os.path.join(base_dir, json_file)
  # Read as UTF-8 explicitly instead of relying on the platform default
  # encoding, which can fail on non-ASCII text in the file.
  with open(user_data_path, 'r', encoding='utf-8') as f:
    return json.load(f)

##### Prepare json file uploader

In [9]:
# Upload widget for the user's own data: accepts '.json' files, one at a time.
uploader = widgets.FileUpload(accept='.json', multiple=False)

-------
-------

# Twitter user classification

-------
-------

## Download model and example user data

In [10]:
# Base working directory and the two sub-directories (with trailing slash)
# that hold the extracted model and the example user data.
content_dir = '/content'
model_destination_dir = f'{content_dir}/model/'
example_destination_dir = f'{content_dir}/example_user_data/'

In [None]:
# Google Drive share links of the two zip archives (fuzzy=True lets gdown
# extract the file id from a ".../view?usp=sharing" link).
model_url = 'https://drive.google.com/file/d/14AltsvxlFXKXCnG81vUIIjgptad_z3X8/view?usp=sharing'
example_user_data_url = 'https://drive.google.com/file/d/1dkJWEOGr7uaXnwiHY8GdsjBJHLYnHH7h/view?usp=sharing'

# Save the archives under content_dir rather than the current working
# directory, so they land where the extraction step looks for them
# (/content/*.zip) regardless of the notebook's cwd.
# A None return from gdown.download is treated as a failed download, so
# the notebook stops here instead of failing later at extraction.
if gdown.download(url=model_url, output=content_dir + '/model.zip', quiet=False, fuzzy=True) is None:
  raise RuntimeError('Downloading the model archive failed.')

if gdown.download(url=example_user_data_url, output=content_dir + '/example_user_data.zip', quiet=False, fuzzy=True) is None:
  raise RuntimeError('Downloading the example user data archive failed.')

In [None]:
!rm -rf "$model_destination_dir"
!rm -rf "$example_destination_dir"

!mkdir -p "$model_destination_dir"
!mkdir -p "$example_destination_dir"

!unzip /content/model.zip -d model/
!unzip /content/example_user_data.zip -d example_user_data/

## Load model

In [13]:
# Load the model file 'model.hdf5' from the model directory
# (model_destination_dir already ends with a slash).
model_path = f'{model_destination_dir}model.hdf5'
model = load_model(model_path)

## Load user data
#### <u>**Important!** Run one of the cells with prepared example data or load your own JSON file with user data</u>

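The JSON file should follow the Twitter API user object scheme and contain the fields the model needs: `description`, `created_at` (formatted like `2007-05-23 06:01:13+00:00`) and `public_metrics` with `followers_count`, `following_count` and `tweet_count`. The rest of the user details are not relevant for the model.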

<u>Example (shortened version with the relevant fields only):</u>

```
{
  "description": "The Real Twitter API. Tweets about API changes, service  issues and our Developer Platform. Don't get an answer? It's on my website.",
  "created_at": "2007-05-23 06:01:13+00:00",
  "public_metrics": {
    "followers_count": 6133636,
    "following_count": 7979,
    "tweet_count": 534
  }
}
```



#### Example 1 (bot user)

In [14]:
# Example 1: parse bot_1.json via load_example and make it the current
# user_data (each example cell overwrites the same variable).
user_data = load_example(json_file='bot_1.json')

#### Example 2 (human user)

In [15]:
# Example 2: parse human_1.json via load_example and make it the current
# user_data (each example cell overwrites the same variable).
user_data = load_example(json_file='human_1.json')

#### Example 3 (bot user)

In [16]:
# Example 3: parse bot_2.json via load_example and make it the current
# user_data (each example cell overwrites the same variable).
user_data = load_example(json_file='bot_2.json')

#### Example 4 (human user)

In [17]:
# Example 4: parse human_2.json via load_example and make it the current
# user_data (each example cell overwrites the same variable).
user_data = load_example(json_file='human_2.json')

#### Load own user data to classify

In [None]:
# Render the JSON upload widget; pick a file with it before running the next
# cell, which reads the upload from uploader.value.
display(uploader)

In [51]:
# Read the uploaded JSON file out of the widget and parse it into user_data.
uploaded = uploader.value
if not uploaded:
  # Give a clear message instead of a bare IndexError when nothing was uploaded.
  raise ValueError('No file uploaded: use the upload widget above before running this cell.')

if isinstance(uploaded, dict):
  # ipywidgets 7.x: dict keyed by file name, content is bytes.
  file_name = next(iter(uploaded))
  data_bytes = uploaded[file_name]['content']
else:
  # ipywidgets 8.x: tuple of per-file dicts, content is a memoryview.
  file_name = uploaded[0]['name']
  data_bytes = uploaded[0]['content']

# bytes(...) accepts both bytes and memoryview content.
data_str = bytes(data_bytes).decode('utf-8')
user_data = json.loads(data_str)

------

## Prepare data for model input

In [52]:
# Extract the raw feature row from user_data, then rescale it with the
# training-set mean/std so it matches what the model was trained on.
user_data_df = standardize_features(
    extract_import_features_to_df(user_data),
    standardization_values,
)

## Classify user

In [53]:
# Run the loaded model on the single-row feature dataframe; verbose=0 is
# passed to predict (Keras: silent mode). The result cell reads the score
# from output[0][0].
output = model.predict(user_data_df, verbose=0)

## Classification result

In [None]:
# A score of 0.5 or more is reported as a bot, anything below as a human.
if output[0][0] >= 0.5:
  label = 'BOT'
else:
  label = 'HUMAN'

print('Probability of user being a bot: {}.'.format(output[0][0]))
print('Analyzed user is classified as a {}.'.format(label))

