In [1]:
import re
import pandas as pd
import requests
import os
import subprocess
import ipaddress
import shutil
import gzip
import numpy as np

In [2]:
def extract_ip(combined_value):
    combined_value = combined_value.replace(" ", "")
    #extract the ip from the string using regex
    ip_match = re.findall(r'\[?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', combined_value)
    if len(ip_match) > 0:
        return ip_match[0]
    return 'null'

def is_private_ip(ip): #check if an ip is private
    try:
        ip_obj = ipaddress.ip_address(ip)
        return ip_obj.is_private
    except ValueError:
        return False

# Using ip range we inferred whether the organization is Regional or Global
def findtype(ip_range):
    if pd.notna(ip_range):
        num = re.findall(r'\/(.*)', ip_range)
        num = int(num[0])
        if 32 - num <= 8:       #is masking <= 8 then consider it as regional
             return "Regional"
        else:                   #else global
             return "Global"
    else :
        return np.nan
    
def add_type(csv_file):

    df =  pd.read_csv(csv_file)
#     print(df)
    df['IP_Type'] = df['IP_range'].apply(findtype)
    
    return df
    

In [3]:
def get_ip_info(ip_address):
    url = f"https://api.bgpview.io/ip/{ip_address}"
    
    response = requests.get(url)
    data = response.json()
    if len(data['data']['prefixes']) == 0:
           return {
                "IP_range": "null",
                "IP_Name":"null",
                "IP_description":"null",
                "IP_Country":"null",
                "ASN_Number": "null",
                "ASN_description":"null",
                "ASN_Name": "null",
                "ASN_Country_Code": "null"
            }  
    i = 0
    for num in range(1,len(data['data']['prefixes'])):
        if(data['data']['prefixes'][num]['cidr'] < data['data']['prefixes'][i]['cidr']):
            i = num
            
    ip_info = {
        "IP_range": data['data']['prefixes'][i]['prefix'],
        "IP_Name":data['data']['prefixes'][i]['name'],
        "IP_description":data['data']['prefixes'][i]['description'],
        "IP_Country":data['data']['prefixes'][i]['country_code'],
        "ASN_Number": data['data']['prefixes'][i]['asn']['asn'],
        "ASN_description":data['data']['prefixes'][i]['asn']['description'],
        "ASN_Name": data['data']['prefixes'][i]['asn']['name'],
        "ASN_Country_Code": data['data']['prefixes'][i]['asn']['country_code']
    }
    return ip_info

In [4]:
get_ip_info('104.71.158.210')

{'IP_range': '104.64.0.0/10',
 'IP_Name': 'AKAMAI',
 'IP_description': 'Akamai Technologies, Inc.',
 'IP_Country': 'US',
 'ASN_Number': 20940,
 'ASN_description': 'Akamai International B.V.',
 'ASN_Name': 'AKAMAI-ASN1',
 'ASN_Country_Code': 'NL'}

In [5]:
def process_single_file(file_path):
    df = pd.read_csv(file_path, sep='\s{2,}', header=None, usecols=[0, 1, 2, 3, 4], engine='python')
    df = df.iloc[:, 1:].reset_index(drop=True)
    df.iloc[:, 0:3].replace('*', 'null', inplace=True)
    df['ip'] = df.iloc[:, 3].apply(extract_ip)
    df = df[df['ip'] != 'null'] #removing request timed out ips
    df = df[[1,2,3,'ip']]
    ip_info_list = []
    for ip_address in df['ip']:
        if not is_private_ip(ip_address):
            ip_data = get_ip_info(ip_address)
            ip_info_list.append(ip_data)
        else:
            # Append with attributes containing null values
            ip_info_list.append({
                "IP_range": "null",
                "IP_Name":"null",
                "IP_description":"null",
                "IP_Country":"null",
                "ASN_Number": "null",
                "ASN_description":"null",
                "ASN_Name": "null",
                "ASN_Country_Code": "null"
            })
    
    ip_info_df = pd.DataFrame(ip_info_list)
    df = df.reset_index()
    final_df = pd.concat([df, ip_info_df], axis=1)
    return final_df

In [6]:
destinations = [
    'BBC_UK',
    'Global_Suzuki',
    'Google',
    'Hyundai',
    'Rakuten',
    'ISRO',
    'MICROSOFT',
    'OpenAI',
    'Zoom',
    'HDFC'
]

In [None]:
# This piece of code will extract info from .txt file and then create .csv file 
for destination in destinations :
    input_files = [] 
    current_directory = os.getcwd()
# List all files in the folder
    file_names = os.listdir(current_directory + f'/{destination}_txt')
    
    for file_name in file_names:
         input_files.append(file_name)

    print(input_files)

    for file in input_files:
        if os.path.splitext(file)[1] == '.txt':
            temp = process_single_file(current_directory + f'/{destination}_txt/' + file)
            name = os.path.splitext(file)[0] #split file name and extention
            os.makedirs(f'{destination}',exist_ok=True)
            output_file_path = os.path.join(current_directory + f'/{destination}/',f"{name}.csv")
            temp.to_csv(output_file_path,index=False)
    
    print("Completed successfully")

['USVPN_www_bbc_co_uk.txt', 'hotspot_www_bbc_co_uk.txt', 'NetherLandsVPN_www_bbc_co_uk.txt', 'localwifi_www_bbc_co_uk.txt', 'JapanVPN_www_bbc_co_uk.txt']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3].replace('*', 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3].replace('*', 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3].replace('*', 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3]

Completed successfully
['hotspot_www_globalsuzuki_com.txt', 'USVPN_www_globalsuzuki_com.txt', 'JapanVPN_www_globalsuzuki_com.txt', 'NetherLandsVPN_www_globalsuzuki_com.txt', 'localwifi_www_globalsuzuki_com.txt']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3].replace('*', 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3].replace('*', 'null', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[:, 0:3].replace('*', 'null', inplace=True)


In [22]:
# This piece of code will rename the csv files

# List of folder names
folder_names = destinations

for folder_name in folder_names:
    folder_path = os.path.join(os.getcwd(), folder_name)
    
    if os.path.exists(folder_path):
        files = os.listdir(folder_path)
        
        for file_name in files:
            source, _ = file_name.split('_', 1)  # Splitting the filename at the first underscore
            
            old_file_path = os.path.join(folder_path, file_name)
            new_file_name = source + os.path.splitext(file_name)[1]  # Maintain the file extension
            new_file_path = os.path.join(folder_path, new_file_name)
            
            os.rename(old_file_path, new_file_path)
            
            print(f"Renamed {file_name} to {new_file_name} in {folder_name}")
    else:
        print(f"Folder {folder_name} does not exist.")


Renamed NetherLandsVPN_zoom_us.csv to NetherLandsVPN.csv in Zoom
Renamed JapanVPN_zoom_us.csv to JapanVPN.csv in Zoom
Renamed localwifi_zoom_us.csv to localwifi.csv in Zoom
Renamed USVPN_zoom_us.csv to USVPN.csv in Zoom
Renamed hotspot_zoom_us.csv to hotspot.csv in Zoom


In [19]:
# This part of code will add ip_type col to the csv files

# Iterate through destination folder names
for destination in destinations:
    input_files = []
    current_directory = os.getcwd()
    
    # List all files in the destination folder
    folder_path = os.path.join(current_directory, destination)
    file_names = os.listdir(folder_path)
    
    for file_name in file_names:
        input_files.append(file_name)
    
    print(input_files)
    
    for file in input_files:
        if os.path.splitext(file)[1] == '.csv':
            print(os.path.join(current_directory, destination, file))
            temp = add_type(os.path.join(current_directory, destination, file))
            
            name = os.path.splitext(file)[0]  # split file name and extension
            
            os.makedirs(f'{destination}2', exist_ok=True)
            
            output_file_path = os.path.join(current_directory, f'{destination}2', f"{name}.csv")
            
            temp.to_csv(output_file_path, index=False)

print("Processing completed.")

['NetherLandsVPN.xlsx', 'USVPN.csv', 'JapanVPN.xlsx', '.~lock.JapanVPN.csv#', 'NetherLandsVPN.csv', 'hotspot.csv', 'localwifi.csv', 'USVPN.xlsx', 'JapanVPN.csv']
/home/charanubuntu/Computer Networks/BBC_UK/USVPN.csv
   index       1       2       3               ip          IP_range  \
0      0  309 ms  297 ms  294 ms    37.19.199.156    37.19.198.0/23   
1      1  284 ms  285 ms  282 ms  185.229.188.174  185.229.188.0/23   
2      3  326 ms  352 ms  351 ms     154.54.3.125     154.48.0.0/12   
3      4  377 ms  304 ms  326 ms     154.54.90.58     154.48.0.0/12   
4      5  344 ms  310 ms  295 ms    38.104.44.122        38.0.0.0/8   
5      6  306 ms  296 ms  298 ms   151.101.208.81    151.101.0.0/16   

                IP_Name                      IP_description IP_Country  \
0            CDNEXT-NYC                          CDNEXT NYC         US   
1  UK-DATACAMP-20171102                    Datacamp Limited         GB   
2                NET154  African Network Information Center     

/home/charanubuntu/Computer Networks/Rakuten/NetherLandsVPN.csv
    index       1       2       3              ip         IP_range  \
0       0  163 ms  184 ms     NaN  185.107.56.254  185.107.56.0/22   
1       1  219 ms  232 ms  298 ms    81.20.64.113    81.20.64.0/20   
2       2  318 ms  305 ms  303 ms    129.250.7.86   129.250.0.0/16   
3       4  382 ms  245 ms  236 ms   129.250.6.147   129.250.0.0/16   
4       5  415 ms  616 ms  409 ms   129.250.6.177   129.250.0.0/16   
5       6  437 ms     NaN     NaN   129.250.4.143   129.250.0.0/16   
6       7  501 ms  494 ms  589 ms   129.250.6.127   129.250.0.0/16   
7       8  601 ms  507 ms  411 ms  203.105.72.218  203.105.64.0/19   
8       9  406 ms  401 ms  452 ms   202.72.49.107   202.72.48.0/20   
9      10  447 ms  446 ms  438 ms   202.72.48.166   202.72.48.0/20   
10     11  456 ms  428 ms  408 ms   202.72.48.171   202.72.48.0/20   
11     12  817 ms  557 ms  468 ms  133.237.16.234   133.237.0.0/16   

                IP_Name  

/home/charanubuntu/Computer Networks/OpenAI/JapanVPN.csv
   index       1       2       3               ip          IP_range  \
0      0  134 ms  165 ms  234 ms    103.125.235.1  103.125.235.0/24   
1      1  138 ms  133 ms     NaN     91.200.240.8   91.200.240.0/24   
2      2  132 ms  137 ms  129 ms     211.14.4.177     211.14.0.0/21   
3      3  127 ms  134 ms  127 ms     211.14.4.178     211.14.0.0/21   
4      5  140 ms  136 ms  136 ms  210.171.224.110               NaN   
5      6     NaN  132 ms  162 ms   104.44.235.188     104.40.0.0/13   
6      7     NaN  148 ms  144 ms   104.44.236.182     104.40.0.0/13   
7      8  134 ms  137 ms  133 ms   13.104.182.215     13.104.0.0/14   
8     11  133 ms  128 ms  132 ms    13.107.246.46     13.104.0.0/14   

             IP_Name                    IP_description IP_Country  ASN_Number  \
0  ProtonVPN-Tokyo-1                         ProtonVPN         JP      3258.0   
1                NaN                               NaN         DE     

In [4]:
# This code will compress all the folders 

# List of folder names

folder_names = destinations 

# Create a directory to store the compressed files
output_folder = 'compressed_folders2'
os.makedirs(output_folder, exist_ok=True)

for folder_name in folder_names:
    folder_path = folder_name + '2'
    archive_name = folder_name + '.zip'  # Name of the archive
    archive_path = os.path.join(output_folder, archive_name)  # Path to the output archive
    
    shutil.make_archive(os.path.splitext(archive_path)[0], 'zip', folder_path)
    
    print(f"Compressed {folder_name} to {archive_name} and moved to {output_folder}")



   index       1       2       3               ip          IP_range  \
0      0  309 ms  297 ms  294 ms    37.19.199.156    37.19.198.0/23   
1      1  284 ms  285 ms  282 ms  185.229.188.174  185.229.188.0/23   
2      3  326 ms  352 ms  351 ms     154.54.3.125     154.48.0.0/12   
3      4  377 ms  304 ms  326 ms     154.54.90.58     154.48.0.0/12   
4      5  344 ms  310 ms  295 ms    38.104.44.122        38.0.0.0/8   
5      6  306 ms  296 ms  298 ms   151.101.208.81    151.101.0.0/16   

                IP_Name                      IP_description IP_Country  \
0            CDNEXT-NYC                          CDNEXT NYC         US   
1  UK-DATACAMP-20171102                    Datacamp Limited         GB   
2                NET154  African Network Information Center         MU   
3                NET154  African Network Information Center         MU   
4                   NaN                              Fastly         US   
5               SKYCA-3                              Fastl