# Bossa Muffin Proxies Filter Program v1 - 24/04/2022 
# With the Requests module

## Import all the modules needed for this program

In [None]:
from typing import Dict
from fake_useragent import UserAgent 
import time
import requests
import concurrent.futures
import csv
import pandas as pd

## Define types

In [None]:
# Shape of the status dicts built by testMyIp and testProxy:
# {'status_code': <int>, 'ip': <str>} -- string keys with mixed value types,
# so the previous Dict[int, str] did not describe them.
IpDict = Dict[str, object]

## Define constants and variables

In [None]:
# CSV read by extractProxyList (its default `file` argument).
FILE_TO_TEST: str = "proxies_list.csv"
# CSV the working proxies are written to at the end of the run.
GOOD_FILE: str = "good_proxies_list.csv"
# Endpoint queried by testMyIp and testProxy; its JSON reply carries an
# 'origin' field holding the caller's IP.
URL_TO_TEST_MY_IP: str = "https://httpbin.org/ip"
# Placeholder IP kept in a status dict until a request succeeds.
IP_DEFAULT: str = "0.0.0.0"

# Result accumulators filled by extractProxy (one dict per tested proxy).
good_proxies: list = []
bad_proxies: list = []

In [None]:
# Add a header giving a random user agent
# `ua` is evaluated once here, so every request sent with `headers`
# (see testProxy) carries the same user-agent string for the whole run.
ua = UserAgent().random
headers = {'user-agent': ua}

In [None]:
# Wait before running to avoid getting banned
#time.sleep(360)

## Get my own IP 

In [None]:
def testMyIp(url: str=URL_TO_TEST_MY_IP, timeout: float=10.0) -> IpDict:
    """Ask `url` which public IP this machine is seen as, without any proxy.

    Args:
        url: Endpoint returning a JSON object with an 'origin' field.
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive endpoint cannot hang the whole run.

    Returns:
        A dict with 'status_code' (0 if no response was received) and 'ip'
        (IP_DEFAULT if the address could not be read from the reply).
    """
    my_ip_status = {
        'status_code': 0,
        'ip': IP_DEFAULT
    }
    print(f"\nWe test my ip by : {url}")

    try:
        # The context manager releases the connection even when reading the
        # JSON body fails part-way.
        with requests.get(url, timeout=timeout) as req:
            my_ip_status['status_code'] = req.status_code
            my_ip_status['ip'] = req.json()['origin']
        print("My ip : ", my_ip_status['ip'])

    # Network errors, a non-JSON body, or a JSON body without 'origin'.
    # Anything else (including Ctrl-C) is deliberately left to propagate.
    except (requests.RequestException, ValueError, KeyError):
        print("!My ip : Failed to join the server!")

    return my_ip_status

## Test a proxy IP by HTTP and HTTPS request

In [None]:
def testProxy(proxy_ip: str, proxy_port: str, proxy_time: str, url: str=URL_TO_TEST_MY_IP) -> IpDict:
    """Request `url` through the given proxy and report what came back.

    Args:
        proxy_ip: Address of the proxy to test.
        proxy_port: Port of the proxy, as text (joined to the IP with ':').
        proxy_time: Passed unchanged as the `timeout` of the request;
            extractProxy supplies the numeric value built by strMsToIntS.
        url: Endpoint returning a JSON object with an 'origin' field.

    Returns:
        A dict with 'status_code' (0 if no response was received) and 'ip'
        (IP_DEFAULT if the address could not be read from the reply).
    """
    request_proxy_status = {
        'status_code': 0,
        'ip': IP_DEFAULT
    }
    proxy_to_test = proxy_ip + ':' + proxy_port
    print(f"\nWe test this Proxy : {proxy_to_test}")

    try:
        # The context manager releases the connection even when reading the
        # JSON body fails part-way.
        with requests.get(url,
                          proxies={'http': proxy_to_test, 'https': proxy_to_test},
                          headers=headers,
                          timeout=proxy_time
                         ) as req:
            # Results go into the dict that is returned. The previous code
            # wrote to an undefined name `r`, so the NameError was swallowed
            # and every proxy was reported as unreachable.
            request_proxy_status['status_code'] = req.status_code
            request_proxy_status['ip'] = req.json()['origin']

    # Network/proxy errors, an invalid timeout value, a non-JSON body, or a
    # JSON body without 'origin'. Programming errors are left to propagate.
    except (requests.RequestException, ValueError, KeyError):
        print(f"\n!Failed to join the proxy : {proxy_ip} on port : {proxy_port} by {url} \n")

    return request_proxy_status

## Verify that the proxy doesn't use my own IP address

In [None]:
# Test the test_proxy result comparing with test_my_ip
def checkProxy(r_my_ip: IpDict, r_the_proxy: IpDict, proxy_ip: str) -> bool:
    """Decide whether a proxy test result shows a working proxy.

    Args:
        r_my_ip: Status dict from testMyIp (our own IP under 'ip').
        r_the_proxy: Status dict from testProxy for the proxy under test.
        proxy_ip: The address of the proxy that was tested.

    Returns:
        True only when the request returned HTTP 200 and the IP seen by the
        server is the proxy's own address, differing from both our IP and
        the IP_DEFAULT placeholder. False otherwise.
    """
    if r_the_proxy['status_code'] != 200:
        print(f"\nStatus code of {proxy_ip}: {r_the_proxy['status_code']}")
        return False

    print(f"\nTested proxy's ip : {proxy_ip}")
    returned_ip = r_the_proxy['ip']

    # The server saw our real address through the proxy.
    if returned_ip == r_my_ip['ip']:
        print("There is a problem :")
        # Report the proxy's address here; the returned one equals our own
        # IP in this branch, so printing it showed the same value twice.
        print(f"Proxy tested ({proxy_ip}) returns my ip ({r_my_ip['ip']}) !")
        return False

    # The server saw some third address rather than the proxy's own.
    if returned_ip != proxy_ip:
        print("There is a problem :")
        # "tested" is the proxy address we used, "returned" is what the
        # server reported (the two labels used to be swapped).
        print(f"Proxy: IP tested ({proxy_ip}) and returned IP ({returned_ip}) are different !")
        return False

    return returned_ip != IP_DEFAULT

## Test if proxies are listening and separate good and bad proxies

In [None]:
# Test, check and separate good and bad proxies
# Look up our own public IP once, up front; extractProxy hands this result
# to checkProxy for every proxy it tests.
my_ip = testMyIp()

def extractProxy(proxy_param: dict):
    """Test one proxy and file its record under good_proxies or bad_proxies.

    `proxy_param` must provide the keys 'ip', 'port', 'timeout' and 'prot'.
    The stored record repeats those four values and adds 'code', the HTTP
    status code obtained while testing.
    """
    ip = proxy_param['ip']
    port = proxy_param['port']
    timeout = proxy_param['timeout']

    probe = testProxy(ip, port, timeout)
    destination = good_proxies if checkProxy(my_ip, probe, ip) else bad_proxies

    # The lists are only mutated, never rebound, so no `global` is needed.
    destination.append({
        'ip': ip,
        'port': port,
        'timeout': timeout,
        'prot': proxy_param['prot'],
        'code': probe['status_code'],
    })

## Extract proxies from the CSV file and build the list of proxies to test

In [None]:
def strMsToIntS(str_ms: str)-> float:
    """Convert a latency such as "850 ms" into a timeout in seconds.

    The result is the latency in seconds with a 20 % safety margin.

    The conversion uses true division: the previous floor division
    truncated to whole seconds before applying the margin, so every
    latency below 1000 ms produced a timeout of 0.

    Args:
        str_ms: Whole number of milliseconds, optionally followed by "ms"
            (with or without a space before it).

    Returns:
        The timeout in seconds, as a float.

    Raises:
        ValueError: If the text left after removing "ms" is not an integer.
    """
    # int() ignores surrounding whitespace, so "850 ms" and "850ms" both work.
    int_ms = int(str_ms.replace("ms", ""))

    return 1.2 * int_ms / 1000

In [None]:
def extractProxyList(file=FILE_TO_TEST)-> list:
    """Read the proxies to test from a CSV file.

    Each non-empty row is read positionally: column 0 is the IP, column 1
    the port, column 2 the protocol and column 3 the latency text that
    strMsToIntS converts into a timeout.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list of dicts with the keys 'ip', 'port', 'timeout' and 'prot',
        in file order.
    """
    proxy_row_from_csv = []

    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires for files passed to csv.reader.
    with open(file, 'r', newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields [] for a blank line (e.g. a trailing one);
            # skip it rather than crash on row[3].
            if not row:
                continue

            proxy_row_from_csv.append({
                'ip': row[0],
                'port': row[1],
                'timeout': strMsToIntS(row[3]),
                'prot': row[2]
            })

    return proxy_row_from_csv

## Test all proxy IPs concurrently with a thread pool

In [None]:
# Test every proxy from the CSV file concurrently; leaving the `with` block
# waits for all submitted tests to finish.
proxy_list = extractProxyList()
with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = {
        executor.submit(extractProxy, proxy_param): proxy_param
        for proxy_param in proxy_list
    }
    # An exception raised inside a worker is stored on its future. The
    # previous executor.map() call never consumed its results, so such
    # errors vanished silently; report them here without aborting the run.
    for finished in concurrent.futures.as_completed(pending):
        error = finished.exception()
        if error is not None:
            print(f"\n!Unexpected error while testing {pending[finished]} : {error!r}")

## Save good IP proxies in the GOOD_FILE.csv

In [None]:
# Report the working proxies, then save them to GOOD_FILE.
working_count = len(good_proxies)
print(f"Proxy success : {good_proxies}")
print(f"We get : {working_count} working proxies")

# Only these four fields are exported; the 'code' entry of each record is
# left out of the CSV.
saved_columns = ['ip', 'port', 'timeout', 'prot']
bdd_results = pd.DataFrame(good_proxies, columns=saved_columns)
bdd_results.to_csv(GOOD_FILE)

# Report the proxies that failed the test.
failed_count = len(bad_proxies)
print(f"Proxy failed : {bad_proxies}")
print(f"{failed_count} don't work")