In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

import xgboost
# !pip install catboost
# import catboost





In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# check xgboost version
print(xgboost.__version__)

1.7.6


In [4]:
train_data = pd.read_csv('/content/drive/MyDrive/Doceree hackathon/Dataset/Doceree-HCP_Train.csv',encoding='unicode_escape', on_bad_lines='skip')
test_data = pd.read_csv('/content/drive/MyDrive/Doceree hackathon/Dataset/Doceree-HCP_Test.csv',encoding='unicode_escape', on_bad_lines='skip')


In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113937 entries, 0 to 113936
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   ID               113937 non-null  int64  
 1   DEVICETYPE       113937 non-null  object 
 2   PLATFORM_ID      113937 non-null  int64  
 3   BIDREQUESTIP     113937 non-null  object 
 4   USERPLATFORMUID  113933 non-null  object 
 5   USERCITY         107578 non-null  object 
 6   USERZIPCODE      109345 non-null  float64
 7   USERAGENT        113935 non-null  object 
 8   PLATFORMTYPE     113937 non-null  object 
 9   CHANNELTYPE      113937 non-null  object 
 10  URL              113937 non-null  object 
 11  KEYWORDS         113937 non-null  object 
 12  TAXONOMY         32313 non-null   object 
 13  IS_HCP           113936 non-null  float64
dtypes: float64(2), int64(2), object(10)
memory usage: 12.2+ MB


In [6]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28493 entries, 0 to 28492
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               28493 non-null  int64  
 1   DEVICETYPE       28493 non-null  object 
 2   PLATFORM_ID      28493 non-null  int64  
 3   BIDREQUESTIP     28493 non-null  object 
 4   USERPLATFORMUID  28493 non-null  object 
 5   USERCITY         26934 non-null  object 
 6   USERZIPCODE      27378 non-null  float64
 7   USERAGENT        28492 non-null  object 
 8   PLATFORMTYPE     28493 non-null  object 
 9   CHANNELTYPE      28493 non-null  object 
 10  URL              28493 non-null  object 
 11  KEYWORDS         28493 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.6+ MB


In [7]:
print(train_data.shape, test_data.shape)

(113937, 14) (28493, 12)


In [8]:
train_data['IS_HCP'].isnull().sum()

1

In [9]:
train_data.loc[train_data["IS_HCP"].isnull()]

# Calculating the outliers
Q1 = train_data['IS_HCP'].quantile(0.25)
Q3 = train_data['IS_HCP'].quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - 1.5*IQR
upper_limit = Q3 + 1.5*IQR

# Removing the outliers
train_data = train_data[(train_data['IS_HCP'] > lower_limit) & (train_data['IS_HCP'] < upper_limit)]

In [10]:
train_data.isnull().sum()

ID                     0
DEVICETYPE             0
PLATFORM_ID            0
BIDREQUESTIP           0
USERPLATFORMUID        4
USERCITY            6359
USERZIPCODE         4592
USERAGENT              2
PLATFORMTYPE           0
CHANNELTYPE            0
URL                    0
KEYWORDS               0
TAXONOMY           81623
IS_HCP                 0
dtype: int64

In [11]:

# Encode categorical columns
categorical_cols = ["DEVICETYPE", "USERPLATFORMUID", "USERCITY", "USERAGENT", "PLATFORMTYPE", "CHANNELTYPE"]
label_encoder = LabelEncoder()
for col in categorical_cols:
    if col in train_data.columns:
        train_data[col] = label_encoder.fit_transform(train_data[col])

In [12]:
train_data.columns

Index(['ID', 'DEVICETYPE', 'PLATFORM_ID', 'BIDREQUESTIP', 'USERPLATFORMUID',
       'USERCITY', 'USERZIPCODE', 'USERAGENT', 'PLATFORMTYPE', 'CHANNELTYPE',
       'URL', 'KEYWORDS', 'TAXONOMY', 'IS_HCP'],
      dtype='object')

In [13]:
# Function to convert IP to int
def ip_to_int(ip_str):
    octets = ip_str.split('.')
    return int(octets[0]) << 24 | int(octets[1]) << 16 | int(octets[2]) << 8 | int(octets[3])

In [14]:
train_data['BIDREQUESTIP']

0           170.173.0.22
1          65.216.253.25
2           66.232.79.22
3         137.54.125.246
4         174.202.231.99
               ...      
113932      68.82.97.126
113933    104.172.11.109
113934     174.21.94.113
113935    69.253.129.131
113936    108.41.233.175
Name: BIDREQUESTIP, Length: 113936, dtype: object

In [15]:
# Convert IP addresses to integers
train_data['BIDREQUESTIP'] = train_data['BIDREQUESTIP'].apply(ip_to_int)

In [16]:
train_data.isnull().sum()

ID                     0
DEVICETYPE             0
PLATFORM_ID            0
BIDREQUESTIP           0
USERPLATFORMUID        0
USERCITY               0
USERZIPCODE         4592
USERAGENT              0
PLATFORMTYPE           0
CHANNELTYPE            0
URL                    0
KEYWORDS               0
TAXONOMY           81623
IS_HCP                 0
dtype: int64

In [17]:
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="geocoder")

# for index, row in temp.iterrows():
#     if pd.isnull(row["USERCITY"]):
#         zipcode = row["USERZIPCODE"]
#         location = geolocator.geocode(zipcode)
#         if location is not None:
#             address = location.address
#             address_parts = address.split(",")
#             if len(address_parts) >= 3:
#                 city = address_parts[-3].strip()
#                 temp.at[index, "USERCITY"] = city

In [18]:
# Drop the URL and TAXONOMY column
if 'URL' in train_data.columns:
    train_data = train_data.drop(columns=['URL'])
if 'TAXONOMY' in train_data.columns:
    train_data = train_data.drop(columns=['TAXONOMY'])

In [19]:
# Vectorize 'KEYWORDS' column
vectorizer = TfidfVectorizer()
vectorizer.fit(train_data['KEYWORDS'])
vector = vectorizer.transform(train_data['KEYWORDS'])
tfidf_data = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names_out())

In [20]:
# Drop the original 'KEYWORDS' column and combine with processed text data
train_data = train_data.drop(columns=['KEYWORDS'])
train_data = pd.concat([train_data, tfidf_data], axis=1)

In [21]:
# Split data into features and target variable
X = train_data.drop(columns=['ID', 'IS_HCP'])
y = train_data['IS_HCP']  # Assuming 'IS_HCP' is the target variable

In [22]:
y.isnull().sum()

1

In [23]:


# Drop rows where target variable is NaN
X = X[~y.isna()]
y = y.dropna()

In [24]:
# Set the ID as the index
train_data.set_index('ID', inplace=True)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# # Train a classifier
# clf = HistGradientBoostingClassifier()
# clf.fit(X_train, y_train)

In [25]:
xg = xgboost.XGBRegressor(n_estimators=200, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
xg.fit(X_train,y_train)
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# # evaluate model
# scores = cross_val_score(xg, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [26]:
# Make predictions on validation set
y_pred = xg.predict(X_val)
y_pred

array([ 0.702589  , -0.00139002,  0.50005496, ...,  0.8769418 ,
        0.918324  ,  0.00093863], dtype=float32)

In [30]:
import numpy as np
yp = np.rint(y_pred)
yp

array([ 1., -0.,  1., ...,  1.,  1.,  0.], dtype=float32)

In [31]:
np.unique(yp)

array([0., 1.], dtype=float32)

In [32]:

# Evaluate the model
accuracy = accuracy_score(y_val, yp)
print(f"Model accuracy: {accuracy}")


Model accuracy: 0.9405827628576444


In [33]:
# Create a DataFrame with IDs and predictions
predictions_df = pd.DataFrame({'ID': X_val.index, 'IS_HCP': y_pred})

# Save the DataFrame to a CSV file
predictions_df.to_csv('predictions.csv', index=False)

In [34]:
train_data.shape

(113937, 1256)

In [35]:
##TEST DATA
test_data.head()

Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS
0,115501,Desktop,2,75.189.231.103,0d5041ff-f0b6-4d1a-9ad7-0a29f7d485b4,Fayetteville,28305.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6...,Online Medical Journal,Website,https://www.clinicaladvisor.com/home/features/...,Family Practice|Drainage|Clinical|Dermatology|...
1,115502,Mobile,2,24.101.33.158,c8396dd0-969f-4d99-a40b-b7bb1f516154,Conneaut Lake,16316.0,Mozilla/5.0 (iPhone; CPU iPhone OS 15_6_1 like...,Online Medical Journal,Website,https://www.ophthalmologyadvisor.com/topics/ca...,General|Clinical|Operative|Medicine|Cardiology...
2,115503,Desktop,2,172.118.216.142,3c97a081-6518-43f8-9f26-369759cfb471,Covina,91724.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.psychiatryadvisor.com/author/tori-...,Abortion|Anxiety Disorders|Apnea|False|Trauma|...
3,115504,Desktop,7,71.105.120.171,3e2578c8-f794-41af-a38c-c5cfb3c0f014,Brooklyn,11226.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cureus.com/articles/105482-diverti...,Health|Male|Neurological Surgery|Otolaryngolog...
4,115505,Desktop,2,73.82.211.73,ec2ae7ce-6a8c-4156-98a7-07203e60f483,Marietta,30062.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://www.renalandurologynews.com/home/confe...,chronic kidney disease|pain|nephrology|disease...


In [None]:
test_data.isnull().sum()

ID                    0
DEVICETYPE            0
PLATFORM_ID           0
BIDREQUESTIP          0
USERPLATFORMUID       0
USERCITY           1559
USERZIPCODE        1115
USERAGENT             1
PLATFORMTYPE          0
CHANNELTYPE           0
URL                   0
KEYWORDS              0
dtype: int64

In [39]:
df_test = pd.DataFrame(test_data)
df_test

Unnamed: 0,ID,DEVICETYPE,PLATFORM_ID,BIDREQUESTIP,USERPLATFORMUID,USERCITY,USERZIPCODE,USERAGENT,PLATFORMTYPE,CHANNELTYPE,URL,KEYWORDS
0,115501,Desktop,2,75.189.231.103,0d5041ff-f0b6-4d1a-9ad7-0a29f7d485b4,Fayetteville,28305.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6...,Online Medical Journal,Website,https://www.clinicaladvisor.com/home/features/...,Family Practice|Drainage|Clinical|Dermatology|...
1,115502,Mobile,2,24.101.33.158,c8396dd0-969f-4d99-a40b-b7bb1f516154,Conneaut Lake,16316.0,Mozilla/5.0 (iPhone; CPU iPhone OS 15_6_1 like...,Online Medical Journal,Website,https://www.ophthalmologyadvisor.com/topics/ca...,General|Clinical|Operative|Medicine|Cardiology...
2,115503,Desktop,2,172.118.216.142,3c97a081-6518-43f8-9f26-369759cfb471,Covina,91724.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.psychiatryadvisor.com/author/tori-...,Abortion|Anxiety Disorders|Apnea|False|Trauma|...
3,115504,Desktop,7,71.105.120.171,3e2578c8-f794-41af-a38c-c5cfb3c0f014,Brooklyn,11226.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cureus.com/articles/105482-diverti...,Health|Male|Neurological Surgery|Otolaryngolog...
4,115505,Desktop,2,73.82.211.73,ec2ae7ce-6a8c-4156-98a7-07203e60f483,Marietta,30062.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://www.renalandurologynews.com/home/confe...,chronic kidney disease|pain|nephrology|disease...
...,...,...,...,...,...,...,...,...,...,...,...,...
28488,143989,Desktop,2,69.202.233.241,78ce4bbe-3885-4c14-b945-a0ea1e4574f4,Brooklyn,11215.0,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,Online Medical Journal,Website,https://www.renalandurologynews.com/home/news/...,Transplantation|Psychiatry|Heart Failure|Angio...
28489,143990,Desktop,7,75.4.190.65,a53799e1-b279-40cf-b467-11b4d659e325,Miami,33178.0,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,Online Medical Journal,Website,https://www.cureus.com/publish/articles/144818...,small|hospitals|nephrology|emergency medicine|...
28490,143991,Desktop,7,137.52.180.45,acc732a8-2ee9-4e2c-9933-70e34db48101,Fort Lauderdale,33314.0,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Online Medical Journal,Website,https://www.cureus.com/registrations/continue?...,small|hospitals|nephrology|emergency medicine|...
28491,143992,Desktop,8,66.249.66.4,dce21294-b105-4abb-b145-4e62d71def44,,,Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Bu...,Online Medical Journal,Website,https://radrounds.com/radiology-case-images-te...,the|disease|pain|radiology|lung abscess|dyspar...


In [40]:
# Encode categorical columns
for col in categorical_cols:
    if col in df_test.columns:
        df_test[col] = label_encoder.fit_transform(df_test[col])

In [41]:
# Convert IP addresses to integers
df_test['BIDREQUESTIP'] = df_test['BIDREQUESTIP'].apply(ip_to_int)

# Drop the URL and TAXONOMY column
if 'URL' in df_test.columns:
    df_test = df_test.drop(columns=['URL'])
if 'TAXONOMY' in df_test.columns:
    df_test = df_test.drop(columns=['TAXONOMY'])

In [42]:
# Vectorize 'KEYWORDS' column
vector = vectorizer.transform(df_test['KEYWORDS'])
tfidf_data = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names_out())

In [43]:
# Drop the original 'KEYWORDS' column and combine with processed text data
df_test = df_test.drop(columns=['KEYWORDS'])
df_test = pd.concat([df_test, tfidf_data], axis=1)

# Dropping IS_HCP
X_test = df_test.drop(columns=['ID'])

In [47]:
# Make predictions on test data
test_predictions = xg.predict(X_test)

In [48]:
test_predictions = np.rint(test_predictions)

In [49]:
# Filter predictions for the specified ID range
start_id = 115501
end_id = 143993
predictions_df = predictions_df[(predictions_df['ID'] >= start_id) & (predictions_df['ID'] <= end_id)]

In [50]:
# Create a mask for IDs in the given range
mask = (df_test['ID'] >= start_id) & (df_test['ID'] <= end_id)

# Filter the test data using the mask
filtered_df_test = df_test[mask]

# Get the features for the filtered test data
X_test = filtered_df_test.drop(columns=['ID'])

In [51]:


# Make predictions on the filtered test data
filtered_test_predictions = xg.predict(X_test)

In [52]:
filtered_test_predictions = np.rint(filtered_test_predictions)

In [53]:
# Save the filtered predictions to a CSV file
filtered_output = pd.DataFrame({"ID": filtered_df_test["ID"], "IS_HCP": filtered_test_predictions})
filtered_output.to_csv("Filtered_Test_Predictions2.csv", index=False)