In [17]:
from statistics import mean, stdev
from pandas import read_csv, DataFrame, concat

# Name of the CSV access log that is passed to read_log().
LOG_FILE = 'access_log_format-1.csv'

def read_log(file_name: str) -> DataFrame:
    ''' Reads the access log CSV and returns it in chronological order.

    Args:
        file_name: path of the CSV file. The column at position 3 is parsed
            as dates and must be named 'datetime'.

    Returns:
        The log as a DataFrame sorted by 'datetime' (ties keep their file
        order) with a fresh 0..n-1 index, so the last row always holds the
        latest timestamp.
    '''
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # (and a no-op) since pandas 2.0.
    ds = read_csv(file_name, parse_dates=[3])
    # The previous `ds['datetime'].sort_index()` assignment re-aligned on the
    # already ordered index and therefore sorted nothing. Sort the rows by
    # timestamp instead; mergesort is stable, so an already sorted file comes
    # back unchanged.
    return ds.sort_values('datetime', kind='mergesort').reset_index(drop=True)


def set_suspicious_tag(df: DataFrame) -> DataFrame:
    ''' Filters 200 status OK and sets the suspicious tag

    The 200-status requests are counted in 30-minute bins. Every row then
    receives, in a new last column 's_tag', the tag of the bin it falls in:

    * 'normal'          - the bin count is not above the mean bin count
    * 'suspicious'      - above the mean and count / stdev < 3
    * 'high-suspicious' - above the mean and count / stdev >= 3

    Args:
        df: access log with a numeric 'response_code' column and a
            datetime-typed 'datetime' column.

    Returns:
        A new DataFrame holding only the 200-status rows (original index
        labels kept), ordered by bin and then by their original order, with
        the 's_tag' column appended. `df` itself is not modified.
    '''
    # Rows without a timestamp cannot be placed in any bin.
    ok_status_df = df[(df['response_code'] == 200) & df['datetime'].notna()]
    if ok_status_df.empty:
        # Nothing to count: return the (empty) selection with the tag column
        # instead of failing inside mean().
        return ok_status_df.assign(s_tag=[])

    # One entry per 30-minute bin between the first and last request,
    # including empty bins; the index holds the (inclusive) bin start.
    bin_sizes = ok_status_df.resample('30min', on='datetime').size()
    counts = bin_sizes.tolist()
    g_mean = mean(counts)
    # stdev() needs two data points. With a single bin its count equals the
    # mean, so the deviation is never consulted below.
    g_stdev = stdev(counts) if len(counts) > 1 else 0.0

    def tag_for(ok_count):
        if ok_count <= g_mean:
            return 'normal'
        # NOTE(review): this divides the raw count by the deviation; it is not
        # a z-score ((count - mean) / stdev). Kept as in the original - confirm
        # the intended threshold. g_stdev cannot be 0 here: a count above the
        # mean implies the counts are not all equal.
        if (ok_count / g_stdev) < 3:
            return 'suspicious'
        return 'high-suspicious'

    bin_tags = [tag_for(ok_count) for ok_count in counts]

    # Locate each row's bin with the same [start, end) convention resample
    # used for counting. The previous (start, end] window moved rows lying
    # exactly on a bin start into the preceding bin and dropped a row lying
    # on the very first bin start.
    bin_pos = bin_sizes.index.searchsorted(ok_status_df['datetime'].array, side='right') - 1

    # Stable ordering by bin reproduces the bin-by-bin layout of the result
    # without growing a DataFrame inside a loop.
    order = bin_pos.argsort(kind='mergesort')
    labeled_dataset = ok_status_df.iloc[order].copy()
    labeled_dataset['s_tag'] = [bin_tags[pos] for pos in bin_pos[order]]
    return labeled_dataset


def get_proba1(df: DataFrame, src_ip: str, user: str, country: str, s_tag: str) -> float:
    ''' Returns the fraction of the rows of df whose s_tag, src_ip, username and country all match the given values'''
    total_rows = len(df)
    same_tag = df['s_tag'] == s_tag
    same_ip = df['src_ip'] == src_ip
    same_user = df['username'] == user
    same_country = df['country'] == country
    # int() keeps the plain-integer division of the row counts
    # (ZeroDivisionError on an empty frame).
    matching_rows = int((same_tag & same_ip & same_user & same_country).sum())
    return matching_rows / total_rows


def get_proba2(df: DataFrame, src_ip: str, user: str, country: str) -> float:
    ''' Returns the fraction of the rows of df whose src_ip, username and country all match the given values'''
    total_rows = len(df)
    same_ip = df['src_ip'] == src_ip
    same_user = df['username'] == user
    same_country = df['country'] == country
    # int() keeps the plain-integer division of the row counts
    # (ZeroDivisionError on an empty frame).
    matching_rows = int((same_ip & same_user & same_country).sum())
    return matching_rows / total_rows


def _match_fraction(df: DataFrame, columns: list):
    ''' For every row of df, the fraction of rows holding the same values in `columns`.

    Rows with a missing value in any of the columns match nothing (fraction 0),
    which is what an equality filter on those columns yields for them.
    Returns a float array in the row order of df.
    '''
    complete = df[columns].notna().all(axis=1).to_numpy()
    matches = complete.astype('int64')
    if complete.any():
        # Size of the group each complete row belongs to, in row order.
        matches[complete] = (
            df[complete].groupby(columns, sort=False)[columns[0]].transform('size').to_numpy()
        )
    return matches / len(df)


def set_wrong_class_proba(df: DataFrame) -> DataFrame:
    ''' Sets the probability of a suspicious tag is wrong

    For every row two fractions are computed over the whole frame:

    * proba1 - rows sharing its s_tag, src_ip, username and country
    * proba2 - rows sharing its src_ip, username and country

    and the score (proba1 + ALPHA) / (proba2 + BETA) is stored in a new last
    column 'proba_wrong_s_tag'.

    Args:
        df: frame with 's_tag', 'src_ip', 'username' and 'country' columns.

    Returns:
        The same DataFrame object; the column is inserted in place.

    Raises:
        ValueError: if 'proba_wrong_s_tag' already exists (DataFrame.insert).
    '''
    # NOTE(review): the origin of these two constants is not visible in this
    # notebook - document where they come from.
    ALPHA = 0.99850757
    BETA = 1
    identity_cols = ['src_ip', 'username', 'country']
    if df.empty:
        wrong_proba = []
    else:
        # Grouped counts replace two full-frame scans per row (quadratic) and
        # the chained label lookups that fail on duplicate index labels.
        proba1 = _match_fraction(df, ['s_tag'] + identity_cols)
        proba2 = _match_fraction(df, identity_cols)
        wrong_proba = (proba1 + ALPHA) / (proba2 + BETA)
    df.insert(df.shape[1], 'proba_wrong_s_tag', wrong_proba)
    return df

Now we keep only the requests with a 200 (OK) status and set the suspicious tag in the dataframe.

In [18]:

# Load the access log and keep the 200-status rows together with their 's_tag' label.
df = set_suspicious_tag(read_log(LOG_FILE))

Then we compute, for every row, the probability that the previous classification is wrong.

In [19]:
# Add the 'proba_wrong_s_tag' column computed by set_wrong_class_proba().
df = set_wrong_class_proba(df)
# df.to_csv('Q3.csv',index=False)
# Plain-text dump of the labelled frame (pandas truncates long frames with '...').
print(df)

              src_ip     userid                 username            datetime  \
0     14.139.187.130     hahiss     hahiss@optonline.net 2017-01-01 02:16:51   
1     14.139.187.130   ahuillet     ahuillet@comcast.net 2017-01-01 02:16:55   
2     14.139.187.130    gtaylor        gtaylor@gmail.com 2017-01-01 02:16:56   
3     68.180.228.229    terjesa    terjesa@sbcglobal.net 2017-01-01 02:17:59   
4     68.180.228.229  smallpaul  smallpaul@optonline.net 2017-01-01 02:17:59   
...              ...        ...                      ...                 ...   
7206  54.234.104.161      yumpy      yumpy@optonline.net 2017-01-04 02:52:03   
7222  54.234.104.161      houle        houle@outlook.com 2017-01-04 02:52:03   
7225  85.159.196.242  peterhoeg    peterhoeg@hotmail.com 2017-01-04 02:54:53   
7226  85.159.196.242   wikinerd        wikinerd@live.com 2017-01-04 02:54:56   
7228  163.172.65.131     munjal           munjal@aol.com 2017-01-04 02:59:20   

                                       