In [23]:
import time
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import chromedriver_binary
from bs4 import BeautifulSoup
from selenium import webdriver

# Load the raw dataset listing that holds one row per Kaggle dataset,
# including the author's display name and profile URL (`author_id`).
#
# NOTE: no browser is started here. The Data Extraction step creates its own
# `webdriver.Chrome()` instance right before crawling; launching one in this
# cell as well only left an unused Chrome process running in the background.
df = pd.read_csv('dataset_raw.csv')
df.head()

Unnamed: 0,title,author_name,author_id,size,file_type,no_of_files,upvotes,medal,usability,date,day,time,dataset_link,img_link
0,Netflix TV Shows and Movies,Victor Soeiro,https://www.kaggle.com/victorsoeiro,2 MB,CSV,2,180.0,Bronze,10.0,5/15/2022,Sun,05:31:23,https://www.kaggle.com/datasets/victorsoeiro/n...,
1,Supermarket store branches sales analysis,Suraj Jha,https://www.kaggle.com/surajjha101,10 kB,CSV,1,154.0,Gold,10.0,4/29/2022,Fri,16:40:16,https://www.kaggle.com/datasets/surajjha101/st...,
2,Student Performance Dataset,Dev Ansodariya,https://www.kaggle.com/devansodariya,7 kB,CSV,1,108.0,Bronze,9.7,5/26/2022,Thu,19:25:09,https://www.kaggle.com/datasets/devansodariya/...,
3,Top Hits Spotify from 2000-2019,Mark Koverha,https://www.kaggle.com/paradisejoy,96 kB,CSV,1,120.0,Silver,10.0,5/31/2022,Tue,12:50:57,https://www.kaggle.com/datasets/paradisejoy/to...,
4,Amex Competition Data in Parquet Format,Sanskar Hasija,https://www.kaggle.com/odins0n,9 GB,other,2,83.0,Gold,10.0,5/26/2022,Thu,04:50:19,https://www.kaggle.com/datasets/odins0n/amex-p...,


### 1. Removing Unwanted Columns

In [24]:
# Columns describing the dataset itself are irrelevant for the author crawl;
# dropping them all at once leaves only the author columns in the frame.
unwanted_columns = [
    'img_link', 'medal', 'usability', 'title', 'size', 'file_type',
    'no_of_files', 'upvotes', 'date', 'day', 'time', 'dataset_link',
]
df = df.drop(columns=unwanted_columns)

df.head()

Unnamed: 0,author_name,author_id
0,Victor Soeiro,https://www.kaggle.com/victorsoeiro
1,Suraj Jha,https://www.kaggle.com/surajjha101
2,Dev Ansodariya,https://www.kaggle.com/devansodariya
3,Mark Koverha,https://www.kaggle.com/paradisejoy
4,Sanskar Hasija,https://www.kaggle.com/odins0n


### 2. Removing Null Values

In [25]:
# Keep only the rows in which every column holds a value.
rows_without_nulls = df.notna().all(axis=1)
df = df[rows_without_nulls]

df.head()

Unnamed: 0,author_name,author_id
0,Victor Soeiro,https://www.kaggle.com/victorsoeiro
1,Suraj Jha,https://www.kaggle.com/surajjha101
2,Dev Ansodariya,https://www.kaggle.com/devansodariya
3,Mark Koverha,https://www.kaggle.com/paradisejoy
4,Sanskar Hasija,https://www.kaggle.com/odins0n


### 3. Checking Null Values

In [26]:
# Verify that no missing values are left in the frame. A bare
# `df.isnull().sum()` in the middle of a cell is evaluated and then thrown
# away (only the last expression is displayed), so the check is made
# explicit with an assertion that fails loudly if any null remains.
null_counts = df.isnull().sum()
assert null_counts.sum() == 0, f'unexpected null values remain:\n{null_counts[null_counts > 0]}'

df.head()

Unnamed: 0,author_name,author_id
0,Victor Soeiro,https://www.kaggle.com/victorsoeiro
1,Suraj Jha,https://www.kaggle.com/surajjha101
2,Dev Ansodariya,https://www.kaggle.com/devansodariya
3,Mark Koverha,https://www.kaggle.com/paradisejoy
4,Sanskar Hasija,https://www.kaggle.com/odins0n


### 4. Defining a Function to Check Social Media Links

In [27]:
def check_social(social_link):
    """Sort a profile's social links into GitHub, Twitter, LinkedIn and website.

    Parameters
    ----------
    social_link : iterable
        The URLs found in the profile's social-links list. Entries that are
        not non-empty strings (for example ``None`` coming from an anchor
        without an ``href``) are skipped instead of raising.

    Returns
    -------
    tuple
        ``(github, twitter, linkedin, website)``. Each element is the matching
        URL, or ``np.nan`` when the profile has no link of that kind. If
        several links fall into the same category, the last one wins.

    Notes
    -----
    Any link that is not recognised as GitHub, Twitter or LinkedIn is treated
    as the personal website, regardless of how many links the profile lists.
    Matching is done on the lower-cased URL so ``GitHub.com`` is recognised too.
    """
    github = np.nan
    twitter = np.nan
    website = np.nan
    linkedin = np.nan

    for link in social_link:

        # Skip missing / empty hrefs rather than failing the whole profile.
        if not isinstance(link, str) or not link.strip():
            continue

        lowered = link.lower()

        if 'github' in lowered:
            github = link

        elif 'twitter' in lowered:
            twitter = link

        elif 'linkedin' in lowered:
            linkedin = link

        else:
            # Everything else is the user's own site.
            website = link

    return github, twitter, linkedin, website

### 5. Defining Function to check Kaggle Progress

In [28]:
def _extract_count(label):
    """Return the integer formed by the ASCII digits in ``label`` (0 if none)."""
    digits = ''.join(ch for ch in label if '0' <= ch <= '9')
    return int(digits) if digits else 0


def check_progress(lst):
    """Read the per-category activity counts from the profile navigation labels.

    Parameters
    ----------
    lst : iterable of str
        Text of each navigation tab, e.g. a label containing ``Competitions``
        followed by its count. Labels for other tabs are ignored.

    Returns
    -------
    tuple of int
        ``(competitions, dataset, codes, discussion)``; a category that is
        absent, or whose label carries no number, is reported as ``0``.

    Notes
    -----
    The count is taken from the digits present in the label (thousands
    separators such as ``1,226`` are handled) rather than from fixed character
    offsets, so a label with slightly different spacing or without a number no
    longer raises and wipes out the counts of the other categories.
    """
    competitions = 0
    dataset = 0
    codes = 0
    discussion = 0

    for label in lst:
        if 'Competitions' in label:
            competitions = _extract_count(label)
        elif 'Datasets' in label:
            dataset = _extract_count(label)
        elif 'Code' in label:
            codes = _extract_count(label)
        elif 'Discussion' in label:
            discussion = _extract_count(label)

    return competitions, dataset, codes, discussion

In [33]:
# Keep the first occurrence of every row and discard exact repeats, so each
# author profile is only visited once by the crawl.
df = df.loc[~df.duplicated()]

### 6. Data Extraction

In [34]:
# One row per author profile, in the column order used when the DataFrame is
# built afterwards: link, last_seen, joined_date_time, followers, following,
# competitions, dataset, codes, discussion, basic_info, github, twitter,
# linkedin, website, author_img.
data = []

driver = webdriver.Chrome()

# try/finally guarantees the Chrome process is shut down even if the crawl is
# interrupted or a page load raises; rows collected so far stay in `data`.
try:
    driver.get('https://www.kaggle.com/')

    for link in tqdm(df['author_id']):

        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Every field below is scraped best-effort: when the expected element
        # is missing or malformed the field falls back to its default.
        # `except Exception` (instead of a bare `except`) keeps that behaviour
        # while still letting KeyboardInterrupt stop the long-running loop.

        try:
            author_img = soup.find('img', alt = 'image-url').get('src')
        except Exception:
            author_img = np.nan

        try:
            basic_info = soup.find('p', class_ = 'profile__user-location').text.strip()
        except Exception:
            basic_info = np.nan

        # The metadata paragraph holds both the join date and the last-seen
        # text; look it up once and read the two spans independently.
        try:
            meta_spans = soup.find('p', class_ = 'profile__user-metadata').find_all('span')
        except Exception:
            meta_spans = []

        try:
            # Keep only the part of the title before the " GMT..." suffix.
            # Splitting on the marker (rather than cutting a fixed 31
            # characters) does not depend on the length of the timezone name.
            joined_date_time = meta_spans[0].get('title').split(' GMT')[0]
        except Exception:
            joined_date_time = np.nan

        try:
            last_seen = meta_spans[1].find('span').text
        except Exception:
            last_seen = np.nan

        try:
            social_items = soup.find('ul', class_ = 'profile__social-links').find_all('li')
            hrefs = [item.find('a').get('href') for item in social_items]
            github, twitter, linkedin, website = check_social(hrefs)
        except Exception:
            github    = np.nan
            twitter   = np.nan
            linkedin  = np.nan
            website   = np.nan

        followers = 0
        following = 0
        try:
            follow_texts = [item.text for item in soup.find_all('div', class_ = 'profile__user-followers-item')]
            for text in follow_texts:
                # 'Followers' and 'Following' are both 9 characters long, so
                # the count starts at offset 9. Thousands separators are
                # removed, consistent with check_progress.
                count = int(text[9:].replace(',', ''))
                if 'Followers' in text:
                    followers = count
                else:
                    following = count
        except Exception:
            followers = 0
            following = 0

        try:
            nav_labels = [a.text for a in soup.find('div', class_ = 'pageheader__nav-wrapper').find_all('a')]
            competitions, dataset, codes, discussion = check_progress(nav_labels)
        except Exception:
            competitions = 0
            dataset      = 0
            codes        = 0
            discussion   = 0

        data.append([link, last_seen, joined_date_time, followers, following,
                     competitions, dataset, codes, discussion,
                     basic_info, github, twitter, linkedin, website, author_img])
finally:
    driver.quit()


100%|███████████████████████████████████████| 5652/5652 [37:53<00:00,  2.49it/s]


### 7. Creating DataFrame

In [35]:
# Column names, in the same order as the values appended to `data` for each
# scraped profile.
profile_columns = [
    'link', 'last_seen', 'joined_date_time', 'followers', 'following',
    'competitions', 'dataset', 'codes', 'discussion', 'basic_info',
    'github', 'twitter', 'linkedin', 'website', 'author_img',
]
df = pd.DataFrame(data=data, columns=profile_columns)

df.head()

Unnamed: 0,link,last_seen,joined_date_time,followers,following,competitions,dataset,codes,discussion,basic_info,github,twitter,linkedin,website,author_img
0,https://www.kaggle.com/victorsoeiro,in the past day,Thu Mar 19 2020 07:39:17,10,0,0,18,0,2,"Rio de Janeiro, State of Rio de Janeiro, Brazil",https://github.com/victor-soeiro,,https://www.linkedin.com/in/victor-soeiro/,,https://storage.googleapis.com/kaggle-avatars/...
1,https://www.kaggle.com/surajjha101,in the past day,Sun Feb 06 2022 12:26:46,182,10,0,10,3,1226,"New Delhi, Delhi, India",,https://twitter.com/surajjha10101,,,https://storage.googleapis.com/kaggle-avatars/...
2,https://www.kaggle.com/devansodariya,in the past day,Sun Apr 11 2021 12:21:20,16,0,1,3,4,15,"Ahmedabad, Gujarat, India",https://github.com/Dev228-afk,,https://www.linkedin.com/in/dev-ansodariya-b61...,,https://storage.googleapis.com/kaggle-avatars/...
3,https://www.kaggle.com/paradisejoy,a day ago,Tue Apr 02 2019 22:36:08,1,0,0,1,0,1,,,,,,https://storage.googleapis.com/kaggle-avatars/...
4,https://www.kaggle.com/odins0n,in the past day,Tue Feb 09 2021 23:21:40,410,71,32,16,37,996,"Pune, Maharashtra, India",https://github.com/sanskar-hasija,https://twitter.com/Sanskar_Hasija,https://www.linkedin.com/in/sanskar-hasija/,,https://storage.googleapis.com/kaggle-avatars/...


### 8. Dealing with Null Social Links

In [12]:
# A profile without a given social link gets the placeholder 'Unavailable'
# instead of a null in that column.
for social_column in ('github', 'twitter', 'linkedin', 'website'):
    df[social_column] = df[social_column].fillna('Unavailable')

df.head()

Unnamed: 0,link,last_seen,joined_date_time,followers,following,competitions,dataset,codes,discussion,basic_info,github,twitter,linkedin,website,author_img
0,https://www.kaggle.com/victorsoeiro,in the past day,Thu Mar 19 2020 07:39:17,10,0,0,18,0,2,"Rio de Janeiro, State of Rio de Janeiro, Brazil",https://github.com/victor-soeiro,Unavailable,https://www.linkedin.com/in/victor-soeiro/,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
1,https://www.kaggle.com/surajjha101,in the past day,Sun Feb 06 2022 12:26:46,182,10,0,10,3,1226,"New Delhi, Delhi, India",Unavailable,https://twitter.com/surajjha10101,Unavailable,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
2,https://www.kaggle.com/devansodariya,in the past day,Sun Apr 11 2021 12:21:20,16,0,1,3,4,15,"Ahmedabad, Gujarat, India",https://github.com/Dev228-afk,Unavailable,https://www.linkedin.com/in/dev-ansodariya-b61...,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
3,https://www.kaggle.com/paradisejoy,a day ago,Tue Apr 02 2019 22:36:08,1,0,0,1,0,1,,Unavailable,Unavailable,Unavailable,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
4,https://www.kaggle.com/odins0n,in the past day,Tue Feb 09 2021 23:21:40,410,71,32,16,37,996,"Pune, Maharashtra, India",https://github.com/sanskar-hasija,https://twitter.com/Sanskar_Hasija,https://www.linkedin.com/in/sanskar-hasija/,Unavailable,https://storage.googleapis.com/kaggle-avatars/...


### 9. Checking Null Values

In [15]:
# Number of missing values per column.
df.isna().sum()

link                0
last_seen           0
joined_date_time    0
followers           0
following           0
competitions        0
dataset             0
codes               0
discussion          0
basic_info          0
github              0
twitter             0
linkedin            0
website             0
author_img          0
dtype: int64

### 10. Dealing with Missing Data in `basic_info`

In [21]:
# Replace missing location text with the same 'Unavailable' placeholder used
# for the social links. Two kinds of "missing" are covered:
#   * an empty string  - the location element exists but has no text
#   * a null (NaN)     - the scraper stores np.nan when the element is absent
# The replacement is vectorised, so no intermediate Python list is needed.
basic_info_missing = df['basic_info'].isna() | (df['basic_info'] == '')
df['basic_info'] = df['basic_info'].mask(basic_info_missing, 'Unavailable')

df.head()

Unnamed: 0,link,last_seen,joined_date_time,followers,following,competitions,dataset,codes,discussion,basic_info,github,twitter,linkedin,website,author_img
0,https://www.kaggle.com/victorsoeiro,in the past day,Thu Mar 19 2020 07:39:17,10,0,0,18,0,2,"Rio de Janeiro, State of Rio de Janeiro, Brazil",https://github.com/victor-soeiro,Unavailable,https://www.linkedin.com/in/victor-soeiro/,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
1,https://www.kaggle.com/surajjha101,in the past day,Sun Feb 06 2022 12:26:46,182,10,0,10,3,1226,"New Delhi, Delhi, India",Unavailable,https://twitter.com/surajjha10101,Unavailable,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
2,https://www.kaggle.com/devansodariya,in the past day,Sun Apr 11 2021 12:21:20,16,0,1,3,4,15,"Ahmedabad, Gujarat, India",https://github.com/Dev228-afk,Unavailable,https://www.linkedin.com/in/dev-ansodariya-b61...,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
3,https://www.kaggle.com/paradisejoy,a day ago,Tue Apr 02 2019 22:36:08,1,0,0,1,0,1,Unavailable,Unavailable,Unavailable,Unavailable,Unavailable,https://storage.googleapis.com/kaggle-avatars/...
4,https://www.kaggle.com/odins0n,in the past day,Tue Feb 09 2021 23:21:40,410,71,32,16,37,996,"Pune, Maharashtra, India",https://github.com/sanskar-hasija,https://twitter.com/Sanskar_Hasija,https://www.linkedin.com/in/sanskar-hasija/,Unavailable,https://storage.googleapis.com/kaggle-avatars/...


### 11. Saving the Dataset

In [22]:
# Persist the cleaned author table; the positional index is not written.
output_path = 'users.csv'
df.to_csv(output_path, index=False)