# Import data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv('../data/pipeline/1550_rows_46_columns_with_modelScores.csv')

In [2]:
# Get date difference between 'CreationDate' and 'LastActivityDate' or 'LastEditDate'
import re
from datetime import datetime
def last_date(row):
    pattern = '^\d{4}[-]\d{2}[-]\d{2}'
    if not pd.isna(row['LastActivityDate']) :
        matched = bool(re.match(pattern, row['LastActivityDate']))
        if matched == True:
            datetime_object = datetime.strptime(row['LastActivityDate'],  '%Y-%m-%dT%H:%M:%S.%f')
            return (datetime_object-row['CreationDate']).days
        else:
            # Use LastEditDate
            if not pd.isna(row['LastEditDate']):
                matched_edit = bool(re.match(pattern, row['LastEditDate']))
                if matched_edit == True:
                    datetime_object = datetime.strptime(row['LastEditDate'],  '%Y-%m-%dT%H:%M:%S.%f')
                    return (datetime_object-row['CreationDate']).days
            
    diff = (datetime.today()-row['CreationDate']).days
    return diff

df[['CreationDate']] = df[['CreationDate']].apply(pd.to_datetime)
df['Date_diff'] = df.apply(lambda row : last_date(row), axis = 1) 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1550 entries, 0 to 1549
Data columns (total 49 columns):
Unnamed: 0                     1550 non-null int64
Unnamed: 0.1                   1550 non-null int64
AnswerCount                    1298 non-null float64
ClosedDate                     29 non-null object
CommentCount                   955 non-null object
CommentCount_ans               343 non-null object
CommentCount_que               941 non-null float64
ContentLicense                 953 non-null object
CreationDate                   1550 non-null datetime64[ns]
CreationDate_ans               343 non-null object
CreationDate_que               1298 non-null object
FavoriteCount                  462 non-null float64
Id                             1550 non-null int64
Id_ans                         343 non-null float64
Id_que                         1298 non-null float64
LastActivityDate               955 non-null object
LastActivityDate_ans           343 non-null float64
LastActiv

In [4]:
checked_df = df[df['outdated (manually checked)'].isnull()==False]
checked_df.shape

(187, 49)

## Target

In [5]:
checked_df['outdated (manually checked)'].value_counts()

True     130
False     57
Name: outdated (manually checked), dtype: int64




## Features

In [6]:
list(checked_df.columns)

['Unnamed: 0',
 'Unnamed: 0.1',
 'AnswerCount',
 'ClosedDate',
 'CommentCount',
 'CommentCount_ans',
 'CommentCount_que',
 'ContentLicense',
 'CreationDate',
 'CreationDate_ans',
 'CreationDate_que',
 'FavoriteCount',
 'Id',
 'Id_ans',
 'Id_que',
 'LastActivityDate',
 'LastActivityDate_ans',
 'LastActivityDate_que',
 'LastEditDate',
 'OwnerDisplayName',
 'OwnerUserId',
 'ParentId',
 'PostId',
 'PostTypeId',
 'Score',
 'Score_ans',
 'Score_que',
 'Tags',
 'Text',
 'Title',
 'UserId',
 'ViewCount',
 'cnt_keywords',
 'from_dataset',
 'include_irrel',
 'negative_statement',
 'of_answer',
 'punctuation',
 'sentence',
 'subj_irrel',
 'subject',
 'share_link',
 'aws_related_tags',
 'share_code',
 'outdated (manually checked)',
 'is_training',
 'keywords',
 'model_score',
 'Date_diff']

In [7]:
#Only take useful features
features = ['AnswerCount','CommentCount', 'CommentCount_que', 'FavoriteCount', 'Score', 'Score_que', 'ViewCount', 'cnt_keywords',
       'include_irrel', 'negative_statement', 'of_answer', 'subj_irrel', 'share_link', 'share_code', 'model_score', 'Date_diff']
X_train = checked_df[features]
y = checked_df['outdated (manually checked)']

In [8]:
y_train = y.astype('bool')

# Preprocessing and baseline model

In [9]:
# train test split
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [10]:
# preprocess
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer

cont = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

categorical = df[features].dtypes == bool

preprocess = make_column_transformer(
    (cont, ~categorical), remainder = 'passthrough')

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
scoring = ['accuracy', 'precision', 'recall','f1','roc_auc']

scores_lr = cross_validate(make_pipeline(preprocess, LogisticRegression()), X_train, y_train, cv=10,
                        scoring=scoring)

print('accuracy score: ', np.mean(scores_lr['test_accuracy']))
print('f1 score: ', np.mean(scores_lr['test_f1']))
print('precision score: ', np.mean(scores_lr['test_precision']))
print('recall score: ', np.mean(scores_lr['test_recall']))
print('roc_auc score: ', np.mean(scores_lr['test_roc_auc']))

accuracy score:  0.8353801169590642
f1 score:  0.8820226937813146
precision score:  0.8685626873126873
recall score:  0.9
roc_auc score:  0.8047435897435898


# Other models

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([("preprocessor", preprocess),
                 ("clf", RandomForestClassifier())])

param_grid={'clf__max_features': ['sqrt','log2'],
            'clf__n_estimators': (10, 50, 100)}

grid = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy', cv=5,
                    n_jobs=-1, return_train_score=True)

grid_result =grid.fit(X_train, y_train)

best_params = grid_result.best_params_

rfc = RandomForestClassifier(max_features=best_params['clf__max_features'], 
                            n_estimators=best_params['clf__n_estimators'], 
                            random_state=False, verbose=False)

scores_rfc = cross_validate(make_pipeline(preprocess, rfc), X_train, y_train, cv=10, scoring=scoring)
                        
print('accuracy score: ', np.mean(scores_rfc['test_accuracy']))
print('f1 score: ', np.mean(scores_rfc['test_f1']))
print('precision score: ', np.mean(scores_rfc['test_precision']))
print('recall score: ', np.mean(scores_rfc['test_recall']))
print('roc_auc score: ', np.mean(scores_rfc['test_roc_auc']))

accuracy score:  0.8453216374269005
f1 score:  0.8907283342455757
precision score:  0.865551948051948
recall score:  0.9230769230769231
roc_auc score:  0.7937179487179488


In [12]:
from sklearn.svm import SVC

pipe = Pipeline([("preprocessor", preprocess),
                 ("clf", SVC())])

param_grid={'clf__C': (0.001, 0.01, 0.1, 1, 10),
            'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'clf__gamma': (0.001, 0.01, 0.1, 1)}

grid = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy', cv=5,
                    n_jobs=-1, return_train_score=True)

grid_result =grid.fit(X_train, y_train)

best_params = grid_result.best_params_

svc = SVC(C=best_params['clf__C'],  
          kernel = best_params['clf__kernel'],
          gamma = best_params['clf__gamma'],
          random_state=False, verbose=False)

scores_svc = cross_validate(make_pipeline(preprocess, svc), X_train, y_train, cv=10, scoring=scoring)
                        
print('accuracy score: ', np.mean(scores_svc['test_accuracy']))
print('f1 score: ', np.mean(scores_svc['test_f1']))
print('precision score: ', np.mean(scores_svc['test_precision']))
print('recall score: ', np.mean(scores_svc['test_recall']))
print('roc_auc score: ', np.mean(scores_svc['test_roc_auc']))

accuracy score:  0.8827485380116957
f1 score:  0.9208173690932313
precision score:  0.8723214285714285
recall score:  0.976923076923077
roc_auc score:  0.8314102564102563


In [13]:
from sklearn.linear_model import SGDClassifier

pipe = Pipeline([("preprocessor", preprocess),
                 ("clf", SGDClassifier())])

param_grid={'clf__penalty': ['l1', 'l2'],
            'clf__alpha': np.arange(0.0001, 0.01)}

grid = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy', cv=5,
                    n_jobs=-1, return_train_score=True)

grid_result =grid.fit(X_train, y_train)

best_params = grid_result.best_params_

sdg = SGDClassifier(penalty=best_params['clf__penalty'],  
          alpha = best_params['clf__alpha'],
          random_state=False, verbose=False)

scores_sdg = cross_validate(make_pipeline(preprocess, sdg), X_train, y_train, cv=10, scoring=scoring)
                        
print('accuracy score: ', np.mean(scores_sdg['test_accuracy']))
print('f1 score: ', np.mean(scores_sdg['test_f1']))
print('precision score: ', np.mean(scores_sdg['test_precision']))
print('recall score: ', np.mean(scores_sdg['test_recall']))
print('roc_auc score: ', np.mean(scores_sdg['test_roc_auc']))

accuracy score:  0.7710526315789473
f1 score:  0.8299246591600415
precision score:  0.8429395604395603
recall score:  0.8307692307692308
roc_auc score:  0.7673076923076924


In [14]:
from sklearn.neighbors import KNeighborsClassifier

pipe = Pipeline([("preprocessor", preprocess),
                 ("clf", KNeighborsClassifier())])

param_grid={'clf__n_neighbors': (3, 5, 10),
            'clf__weights': ['uniform', 'distance']}

grid = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy', cv=5,
                    n_jobs=-1, return_train_score=True)

grid_result =grid.fit(X_train, y_train)

best_params = grid_result.best_params_

knn = KNeighborsClassifier(n_neighbors = best_params['clf__n_neighbors'],  
          weights = best_params['clf__weights'])

scores_knn = cross_validate(make_pipeline(preprocess, knn), X_train, y_train, cv=10, scoring=scoring)
                        
print('accuracy score: ', np.mean(scores_knn['test_accuracy']))
print('f1 score: ', np.mean(scores_knn['test_f1']))
print('precision score: ', np.mean(scores_knn['test_precision']))
print('recall score: ', np.mean(scores_knn['test_recall']))
print('roc_auc score: ', np.mean(scores_knn['test_roc_auc']))

accuracy score:  0.850877192982456
f1 score:  0.9028808611567232
precision score:  0.8295483193277311
recall score:  0.9923076923076923
roc_auc score:  0.7971794871794872


## Conclusion

SVC has the best performance.

# Feature selection

In [15]:
X_train_n = preprocess.fit_transform(X_train)

In [16]:
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(rfc).fit(X_train_n,y_train)
print(sfm.transform(X_train_n).shape)

(187, 3)


In [17]:
numeric_cols = np.array(X_train.select_dtypes(exclude='bool').columns)
bool_cols = np.array(X_train.select_dtypes(include='bool').columns)
all_columns = np.hstack([numeric_cols, bool_cols])

In [18]:
all_columns[sfm.get_support()]

array(['ViewCount', 'model_score', 'Date_diff'], dtype=object)

In [19]:
X_train_useful = X_train_n[:, sfm.get_support()]
X_train_useful.shape

(187, 3)

In [20]:
scores_rfc_2 = cross_validate(rfc, X_train_useful, y_train, cv=10, scoring=scoring)
                        
print('accuracy score: ', np.mean(scores_rfc_2['test_accuracy']))
print('f1 score: ', np.mean(scores_rfc_2['test_f1']))
print('precision score: ', np.mean(scores_rfc_2['test_precision']))
print('recall score: ', np.mean(scores_rfc_2['test_recall']))
print('roc_auc score: ', np.mean(scores_rfc_2['test_roc_auc']))

accuracy score:  0.8230994152046783
f1 score:  0.8679855585027999
precision score:  0.8724775224775225
recall score:  0.8769230769230768
roc_auc score:  0.8525641025641028


Although the accuracy score was improved after select features, we noticed that the recall is 1 now, so we chose not to use feature selection.



# Prediction

In [21]:
X_pre = df[features]
X_pre.shape

(1550, 16)

In [22]:
def clean_CommentCount(row):
    if not pd.isna(row['CommentCount']):
        matched = bool(re.match(r"\b[A-Z]", row['CommentCount']))
        pattern = '^\d{4}[-]\d{2}[-]\d{2}'
        matched2 = bool(re.match(pattern, row['CommentCount']))
        if matched == True or matched2==True:
            return 0
        else:
            return row['CommentCount']
    return 0

df_new = X_pre.copy()  
df_new['CommentCount_new'] = df_new.apply(lambda row : clean_CommentCount(row), axis = 1) 
X_pre = X_pre.assign(CommentCount=df_new['CommentCount_new'])

In [23]:
pipe_SVC = Pipeline([("preprocessor", preprocess),
                 ("clf", svc)])
pipe_SVC.fit(X_train,y_train)
res = pipe_SVC.predict(X_pre)

In [24]:
pipe_rfc = Pipeline([("preprocessor", preprocess),
                 ("clf", rfc)])
pipe_rfc.fit(X_train,y_train)
res_rfc = pipe_rfc.predict(X_pre)

In [26]:
df['predict'] = res

In [27]:
df.to_csv('../data/pipeline/1550_with_predict_label.csv')