In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from retrying import retry
from requests.exceptions import RequestException, ConnectTimeout

# GET a page, retrying failed requests with exponential back-off.
@retry(
    wait_exponential_multiplier=1000,
    wait_exponential_max=10000,
    stop_max_attempt_number=3,
    # ConnectTimeout derives from RequestException, so a single check covers both.
    retry_on_exception=lambda exc: isinstance(exc, RequestException),
)
def fetch_page(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response.

    The call is attempted at most three times; only exceptions derived from
    ``requests.exceptions.RequestException`` trigger another attempt.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 10, the value previously hard-coded here.

    Returns:
        The ``requests.Response`` for a successful (non-4xx/5xx) reply.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an error status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# Search endpoint and the form fields posted to it.
url = "https://maharerait.mahaonline.gov.in/SearchList/Search"
search_data = {
    # NOTE(review): hard-coded verification token; presumably copied from a
    # browser session and may stop being accepted - TODO confirm.
    '__RequestVerificationToken': '-8sf6SeyGSCPB-ZGIlz43EFH7Q5v83OunCAqJdQmP-esfJDKqOZ98hcjqvK_mzO9-t32MbTJm8W_wBRd8HNp5-sSh5dbIID_QNSm1bYZZTM1',
    'Type': 'Agent',
    'AgentName': 'abhi',
    'pageTraverse': '1'
}

# Seconds to wait for the search endpoint before giving up.
timeout_value = 30

# Submit the search form. On failure, report the problem and stop with exit
# status 1. SystemExit is raised explicitly because `exit()` is a helper meant
# for interactive shells and may not abort the running cell in a notebook,
# which would let the parsing below run without a valid `response`.
try:
    response = requests.post(url, data=search_data, timeout=timeout_value)
    response.raise_for_status()
except requests.exceptions.Timeout:
    print("Timeout occurred. The server took too long to respond.")
    raise SystemExit(1)
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    raise SystemExit(1)

# Every result in the returned HTML is a <tr class="grid-row">.
soup = BeautifulSoup(response.text, 'html.parser')
data = soup.find_all('tr', class_='grid-row')

# Column lists for the first pass; each loop iteration appends exactly one
# value to every list so the columns stay the same length.
broker_names = []
certificate_numbers = []
criminal_records = []
house_numbers = []
building_names = []
pin_codes = []


def _text_after(label_tag, missing):
    """Return the stripped text of the first <div> following ``label_tag``.

    ``missing`` is returned when the label itself is ``None`` or when no
    <div> follows it.
    """
    if label_tag is None:
        return missing
    value_div = label_tag.find_next('div')
    if value_div is None:
        return missing
    return value_div.text.strip()


# Extract the data for every row of the search results
for item in data:
    broker_names.append(
        item.find('td', class_='grid-cell', attrs={'data-name': 'Name'}).text.strip()
    )
    certificate_numbers.append(
        item.find('td', class_='grid-cell', attrs={'data-name': 'CertiNo'}).text.strip()
    )

    link = item.find('a')['href']
    full_url = f"https://maharerait.mahaonline.gov.in{link}"

    # Use fetch_page so the detail request has a timeout, retries and a status
    # check. If it still fails, parse an empty document: every lookup below
    # then yields its "not found" placeholder and the row stays aligned.
    try:
        response_new = fetch_page(full_url)
    except requests.exceptions.RequestException as e:
        print(f"Error accessing page: {full_url}")
        print(e)
        new_soup = BeautifulSoup('', 'html.parser')
    else:
        new_soup = BeautifulSoup(response_new.text, 'html.parser')

    house_numbers.append(
        _text_after(new_soup.find('label', string='House Number'), "House number not found.")
    )
    building_names.append(
        _text_after(new_soup.find('label', string='Building Name'), "Building name not found.")
    )
    pin_codes.append(
        _text_after(new_soup.find('label', string='Pin Code'), "Pin code not found.")
    )
    # The criminal-record label is located by its `for` attribute, not its text.
    criminal_records.append(
        _text_after(
            new_soup.select_one('label.mandatory[for="PersonalInfoModel_IsCriminal"]'),
            "Criminal record not found.",
        )
    )

# Assemble the first-pass columns into a table, one row per search result.
result_df1 = pd.DataFrame(
    {
        'Broker Name': broker_names,
        'Certificate Number': certificate_numbers,
        'Criminal Record': criminal_records,
        'House Number': house_numbers,
        'Building Name': building_names,
        'Pin Code': pin_codes,
    }
)

# Number the rows 1..N rather than pandas' default 0..N-1.
result_df1.index = pd.RangeIndex(start=1, stop=len(result_df1) + 1)

# Show the first-pass table.
print(result_df1)

# Re-submit the search form (POST), retrying only request-level failures.
@retry(
    wait_exponential_multiplier=1000,
    wait_exponential_max=10000,
    stop_max_attempt_number=3,
    # Without a predicate every exception (including programming errors) would
    # be retried; restrict retries to requests' own failures, as fetch_page does.
    retry_on_exception=lambda exc: isinstance(exc, RequestException),
)
def fetch_second_page(url):
    """POST the module-level ``search_data`` form to ``url`` and return the response.

    The request uses the module-level ``timeout_value`` as its timeout and is
    attempted at most three times.

    Args:
        url: Address the search form is posted to.

    Returns:
        The ``requests.Response`` for a successful (non-4xx/5xx) reply.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            server answers with an error status (via ``raise_for_status``).
    """
    response = requests.post(url, data=search_data, timeout=timeout_value)
    response.raise_for_status()
    return response

# Second pass: column lists for the address fields read from each detail page.
localities = []
landmarks = []
states = []
divisions = []
districts = []
talukas = []
villages = []

# Re-run the search through the retrying helper. On failure, report the
# problem and stop with exit status 1. SystemExit is raised explicitly because
# `exit()` is a helper meant for interactive shells and may not abort the
# running cell in a notebook, which would let the parsing below run without a
# fresh `response`.
try:
    response = fetch_second_page(url)
except requests.exceptions.Timeout:
    print("Timeout occurred. The server took too long to respond.")
    raise SystemExit(1)
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
    raise SystemExit(1)

# Every result in the returned HTML is a <tr class="grid-row">.
soup = BeautifulSoup(response.text, 'html.parser')
data = soup.find_all('tr', class_='grid-row')

# Label caption on the detail page -> list collecting that field's values.
address_columns = {
    'Locality': localities,
    'Landmark': landmarks,
    'State/UT': states,
    'Division': divisions,
    'District': districts,
    'Taluka': talukas,
    'Village': villages,
}

# Visit the detail page linked from every search-result row.
for item in data:
    link = item.find('a')['href']
    full_url = f"https://maharerait.mahaonline.gov.in{link}"

    # Start with None for every field; anything not found stays None.
    values = dict.fromkeys(address_columns)

    try:
        response_new = fetch_page(full_url)
    except requests.exceptions.RequestException as e:
        # Report the failure but still record a row of None values below.
        # Skipping the row would make this table shorter than the first-pass
        # table and shift every later row when the two are joined side by side.
        print(f"Error accessing page: {full_url}")
        print(e)
    else:
        new_soup = BeautifulSoup(response_new.text, 'html.parser')
        for row in new_soup.find_all('div', class_='row'):
            for label in row.find_all('label'):
                caption = label.get_text(strip=True)
                if caption not in values:
                    continue
                value_div = label.find_next('div')
                if value_div is not None:
                    # A later match for the same caption overwrites an earlier one.
                    values[caption] = value_div.get_text(strip=True)

    # Exactly one append per list per row keeps all columns the same length.
    for caption, column in address_columns.items():
        column.append(values[caption])

# Assemble the second-pass (address) columns into their own table.
result_df2 = pd.DataFrame(
    {
        'Locality': localities,
        'Landmark': landmarks,
        'State/UT': states,
        'Division': divisions,
        'District': districts,
        'Taluka': talukas,
        'Village': villages,
    }
)

# Number the rows 1..N so they share an index with the first-pass table.
result_df2.index = pd.RangeIndex(start=1, stop=len(result_df2) + 1)

# Place the two tables side by side; rows are matched on the index.
final_result = pd.concat(objs=[result_df1, result_df2], axis="columns")

# Show the combined table.
print(final_result)


                        Broker Name Certificate Number  \
1               Abhishek  Kandalkar       A51800013819   
2        ABHIMANYU MAROTRAO TAYWADE       A50300006147   
3          Abhijeet Laxman Pardeshi       A52100005879   
4          ABHIJIT SANTOSH JOPULKAR       A51600018559   
5              ABHIJIT RAMDAS BHURE       A51000037945   
6           ABHISHEK KISHOR BIDAWAT       A51600018797   
7          Abhijeet Shrikant Mankar       A50100038196   
8         ABHISHEK SOMNATH BIRAJDAR       A52500031627   
9              Abhijeet P Manathkar       A51100030603   
10  ABHIDI REALTY ADVISORS PVT.LTD.       A51900002423   
11         ABHIJEET BHAURAO GAIKWAD       A51500035292   
12            ABHIJIT DEORAOJI KALE       A50500021021   
13                ABHIJIT  MUKERJEE       A15700022618   
14       ABHIDI RETAIL ADVISORS LLP       A51900002874   
15           Abhijit Ravasaheb Hude       A52400042585   
16       ABHILASH CHANDRAKANT AGWAN       A52300043873   
17            

In [3]:
# Bare expression: the notebook renders the combined table as this cell's output.
final_result

Unnamed: 0,Broker Name,Certificate Number,Criminal Record,House Number,Building Name,Pin Code,Locality,Landmark,State/UT,Division,District,Taluka,Village
1,Abhishek Kandalkar,A51800013819,No,401,Vikram Apartments,400049,Juhu,Near Prithvi Theatre,MAHARASHTRA,Konkan,Mumbai Suburban,Andheri,Andheri
2,ABHIMANYU MAROTRAO TAYWADE,A50300006147,No,B23,TAYWADE HOUSE,444603,ARJUN NAGAR,NEAR BY HANUMAN TEMPLE,MAHARASHTRA,Amravati,Amravati,Amravati,Amravati (M Corp.)
3,Abhijeet Laxman Pardeshi,A52100005879,No,10,Kapil Apartments,411038,"Paud road, Kothrud,",Opp Vanaz Company,MAHARASHTRA,Pune,Pune,Haveli,Kothrud
4,ABHIJIT SANTOSH JOPULKAR,A51600018559,No,VENU SADAN OM NAGAR BEHIND SAATBHAI NAGAR,JAIL ROAD,422101,NASHIK,NEAR ADHAAV CHAWL,MAHARASHTRA,Nashik,Nashik,Nashik,Nashik Road
5,ABHIJIT RAMDAS BHURE,A51000037945,No,69,KUNDAN NAGAR,445302,KELAPUR,HERO SHOWROOM,MAHARASHTRA,Amravati,Yavatmal,Kelapur,
6,ABHISHEK KISHOR BIDAWAT,A51600018797,No,OFFICE NO 02 6TH FLOOR,SIDDHI POOJA BUSINESS SQUARE,422002,SHARANPUR,OPP SONY PAITHANI SADI,MAHARASHTRA,Nashik,Nashik,Nashik,Nashik (M Corp.)
7,Abhijeet Shrikant Mankar,A50100038196,No,Near Water Tank,Mahajani Plot,444005,Mahajani Plot,Mahajani Plot,MAHARASHTRA,Amravati,Akola,Akola,Akola (M Corp.)
8,ABHISHEK SOMNATH BIRAJDAR,A52500031627,No,C/O Somnath,Yashawant Nagar,413605,Murum (Rural),Omerga,MAHARASHTRA,Aurangabad,Osmanabad,Umarga,Murum (Rural)
9,Abhijeet P Manathkar,A51100030603,No,1-7-478,Ashirwad,431602,Near Pawdewadi Naka,Opp.Tirupati Appartment,MAHARASHTRA,Aurangabad,Nanded,Nanded,
10,ABHIDI REALTY ADVISORS PVT.LTD.,A51900002423,Criminal record not found.,House number not found.,BARODAWALA MANSION,400018,WORLI,,MAHARASHTRA,Konkan,Mumbai City,Mumbai City,Mumbai City


In [4]:
# Destination of the exported table (an absolute path on the author's machine).
file_path = 'C:\\Users\\abhic\\OneDrive\\Desktop\\New foder\\project\\1project.csv'

# Write the combined table as CSV, leaving out the 1-based row index.
final_result.to_csv(path_or_buf=file_path, index=False)