In [26]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Seed Python's global `random` generator so the synthetic dataset below is identical on every run
random.seed(42)

# Generate a practical sample dataset
def generate_practical_data(num_rows=500, rng=None):
    """Build a synthetic construction-project dataset as a list of row dicts.

    Each row is derived from three random draws (project duration in days,
    land size and number of floors); every other column is computed from
    those draws with fixed unit costs.

    Args:
        num_rows: Number of project rows to generate. Zero or a negative
            value yields an empty list.
        rng: Optional random source exposing ``randint`` and ``uniform``
            (for example ``random.Random(42)``). When omitted, the global
            ``random`` module is used, so a prior ``random.seed(...)`` call
            controls the output exactly as before.

    Returns:
        A list of dicts with the keys ``Project_ID``, ``Land_Size_Sqm``,
        ``Num_Floors``, ``Tons_Cement``, ``Tons_Concrete``, ``Tons_Sand``,
        ``Labor_Required``, ``Start_Date``, ``End_Date``, ``Total_Budget``,
        ``Num_Bricks``, ``Tons_Iron`` and ``Total_Cost``. ``Total_Budget``
        covers cement, sand, concrete and labor; ``Total_Cost`` adds bricks
        and iron on top of it. Both totals are computed from the unrounded
        quantities and are not rounded themselves.
    """
    # Fall back to the module-level generator so existing callers that seed
    # `random` globally get byte-identical results.
    if rng is None:
        rng = random

    # Loop-invariant values: every project starts on the same day and uses
    # the same unit costs, so build them once instead of once per row.
    start_date = datetime(2020, 1, 1)
    cost_per_ton_cement = 8000
    cost_per_ton_sand = 7500
    cost_per_ton_concrete = 6000
    cost_per_brick = 6
    cost_per_ton_iron = 500000
    labor_cost_per_worker = 5000

    data = []

    for i in range(num_rows):
        # The order of the three draws (duration, land size, floors) must not
        # change: it determines which random numbers each column receives.
        project_duration = rng.randint(30, 365)  # Project duration in days
        end_date = start_date + timedelta(days=project_duration)

        land_size = rng.uniform(800, 5000)
        num_floors = rng.randint(1, 15)

        # Material quantities scale with land size, are capped by a per-floor
        # allowance, and never drop below a fixed minimum.
        tons_cement = max(10, min(land_size / 5, num_floors * 100))
        tons_concrete = max(50, min(land_size / 10, num_floors * 200))
        tons_sand = max(20, min(land_size / 20, num_floors * 50))

        # Five workers per floor, clamped to the range [5, 150].
        labor_required = max(5, min(num_floors * 5, 150))

        # Budget = cement + sand + concrete + labor (summation order kept so
        # floating-point results match earlier runs).
        total_budget = (tons_cement * cost_per_ton_cement +
                        tons_sand * cost_per_ton_sand +
                        tons_concrete * cost_per_ton_concrete +
                        labor_required * labor_cost_per_worker)

        num_bricks = int(land_size / 10)  # One brick per 10 sqm of land (simplifying assumption)

        # Iron scales with land size, capped at 5 tons per floor.
        tons_iron = max(0, min(land_size / 100, num_floors * 5))

        total_cost = (num_bricks * cost_per_brick +
                      tons_iron * cost_per_ton_iron +
                      total_budget)

        data.append({
            'Project_ID': f'P-{i + 1}',
            'Land_Size_Sqm': round(land_size, 2),
            'Num_Floors': num_floors,
            'Tons_Cement': round(tons_cement, 2),
            'Tons_Concrete': round(tons_concrete, 2),
            'Tons_Sand': round(tons_sand, 2),
            'Labor_Required': labor_required,
            'Start_Date': start_date,
            'End_Date': end_date,
            'Total_Budget': total_budget,
            'Num_Bricks': num_bricks,
            'Tons_Iron': round(tons_iron, 2),
            'Total_Cost': total_cost,
        })

    return data


dataset = pd.DataFrame(generate_practical_data())

# Write to a path relative to the current working directory instead of a
# user-specific absolute Desktop path: the notebook then runs on any machine,
# and the later cell that reloads 'output.csv' reads exactly the file written here.
csv_file_path = 'output.csv'

# Save the DataFrame to the CSV file (index=False keeps the row index out of the file)
dataset.to_csv(csv_file_path, index=False)

# Print the DataFrame
print(dataset)

    Project_ID  Land_Size_Sqm  Num_Floors  Tons_Cement  Tons_Concrete  \
0          P-1        1267.59          12       253.52         126.76   
1          P-2        1828.55           3       300.00         182.85   
2          P-3        3642.14          15       728.43         364.21   
3          P-4        1165.14           7       233.03         116.51   
4          P-5         925.15           4       185.03          92.51   
..         ...            ...         ...          ...            ...   
495      P-496        1461.12          10       292.22         146.11   
496      P-497        3630.67           5       500.00         363.07   
497      P-498        3580.14           8       716.03         358.01   
498      P-499        2656.59           5       500.00         265.66   
499      P-500        3970.76           2       200.00         397.08   

     Tons_Sand  Labor_Required Start_Date   End_Date  Total_Budget  \
0        63.38              60 2020-01-01 2020-12-23 

In [27]:
# Reload the dataset from 'output.csv' (resolved against the current working directory)
# and list each column's dtype and non-null count.
df=pd.read_csv('output.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Project_ID      500 non-null    object 
 1   Land_Size_Sqm   500 non-null    float64
 2   Num_Floors      500 non-null    int64  
 3   Tons_Cement     500 non-null    float64
 4   Tons_Concrete   500 non-null    float64
 5   Tons_Sand       500 non-null    float64
 6   Labor_Required  500 non-null    int64  
 7   Start_Date      500 non-null    object 
 8   End_Date        500 non-null    object 
 9   Total_Budget    500 non-null    float64
 10  Num_Bricks      500 non-null    int64  
 11  Tons_Iron       500 non-null    float64
 12  Total_Cost      500 non-null    float64
dtypes: float64(7), int64(3), object(3)
memory usage: 50.9+ KB


In [28]:
# Parse the YYYY-MM-DD strings read from the CSV into datetime64 columns
for date_col in ('End_Date', 'Start_Date'):
    df[date_col] = pd.to_datetime(df[date_col], format="%Y-%m-%d")

In [29]:
# Re-check dtypes: Start_Date and End_Date should now be datetime64[ns] instead of object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Project_ID      500 non-null    object        
 1   Land_Size_Sqm   500 non-null    float64       
 2   Num_Floors      500 non-null    int64         
 3   Tons_Cement     500 non-null    float64       
 4   Tons_Concrete   500 non-null    float64       
 5   Tons_Sand       500 non-null    float64       
 6   Labor_Required  500 non-null    int64         
 7   Start_Date      500 non-null    datetime64[ns]
 8   End_Date        500 non-null    datetime64[ns]
 9   Total_Budget    500 non-null    float64       
 10  Num_Bricks      500 non-null    int64         
 11  Tons_Iron       500 non-null    float64       
 12  Total_Cost      500 non-null    float64       
dtypes: datetime64[ns](2), float64(7), int64(3), object(1)
memory usage: 50.9+ KB


In [33]:
# Project duration as a timedelta: end date minus start date, row by row
df['No.of days'] = df['End_Date'].sub(df['Start_Date'])

In [34]:
# Display the frame (the notebook truncates it to its first and last rows) to inspect the new 'No.of days' column
df

Unnamed: 0,Project_ID,Land_Size_Sqm,Num_Floors,Tons_Cement,Tons_Concrete,Tons_Sand,Labor_Required,Start_Date,End_Date,Total_Budget,Num_Bricks,Tons_Iron,Total_Cost,No.of days
0,P-1,1267.59,12,253.52,126.76,63.38,60,2020-01-01,2020-12-23,3.564046e+06,126,12.68,9.902754e+06,357 days
1,P-2,1828.55,3,300.00,182.85,91.43,15,2020-01-01,2020-06-19,4.257832e+06,182,15.00,1.175892e+07,170 days
2,P-3,3642.14,15,728.43,364.21,182.11,75,2020-01-01,2020-03-23,9.753505e+06,364,36.42,2.796638e+07,82 days
3,P-4,1165.14,7,233.03,116.51,58.26,35,2020-01-01,2020-11-05,3.175243e+06,116,11.65,9.001655e+06,309 days
4,P-5,925.15,4,185.03,92.51,46.26,20,2020-01-01,2020-02-16,2.482257e+06,92,9.25,7.108551e+06,46 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,P-496,1461.12,10,292.22,146.11,73.06,50,2020-01-01,2020-04-19,4.012388e+06,146,14.61,1.131887e+07,109 days
496,P-497,3630.67,5,500.00,363.07,181.53,25,2020-01-01,2020-02-25,7.664902e+06,363,25.00,2.016708e+07,55 days
497,P-498,3580.14,8,716.03,358.01,179.01,40,2020-01-01,2020-09-13,9.418867e+06,358,35.80,2.732173e+07,256 days
498,P-499,2656.59,5,500.00,265.66,132.83,25,2020-01-01,2020-12-06,6.715171e+06,265,25.00,1.921676e+07,340 days


In [35]:
# Confirm the new 'No.of days' column is present and stored as timedelta64[ns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   Project_ID      500 non-null    object         
 1   Land_Size_Sqm   500 non-null    float64        
 2   Num_Floors      500 non-null    int64          
 3   Tons_Cement     500 non-null    float64        
 4   Tons_Concrete   500 non-null    float64        
 5   Tons_Sand       500 non-null    float64        
 6   Labor_Required  500 non-null    int64          
 7   Start_Date      500 non-null    datetime64[ns] 
 8   End_Date        500 non-null    datetime64[ns] 
 9   Total_Budget    500 non-null    float64        
 10  Num_Bricks      500 non-null    int64          
 11  Tons_Iron       500 non-null    float64        
 12  Total_Cost      500 non-null    float64        
 13  No.of days      500 non-null    timedelta64[ns]
dtypes: datetime64[ns](2), float64(7), int64(3)

In [36]:
# Project_ID is only a row label, so remove it before modelling.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns='Project_ID', errors='ignore')

In [40]:
# Features: land size and number of floors
x = df[['Land_Size_Sqm', 'Num_Floors']]
# NOTE(review): the original cell stopped at `y=` (a SyntaxError). Total_Cost is
# assumed to be the regression target — confirm, or switch to Total_Budget.
y = df['Total_Cost']

In [None]:
# NOTE(review): this draft cell was left as `x=df[]`, which is a SyntaxError and
# stops Run All. It now repeats the feature selection used in the previous cell;
# delete the cell if it is redundant.
x = df[['Land_Size_Sqm', 'Num_Floors']]

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train/test subsets. The original call passed no
# arrays and unpacked into three names, which fails at runtime; train_test_split(x, y)
# returns four objects in the order x_train, x_test, y_train, y_test.
# random_state=42 makes the split reproducible.
# Assumes `x` and `y` are defined by the feature/target cell above.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [None]:
import xgboost as xgb
# Instantiate an XGBoost regressor; no arguments are passed, so the library defaults apply
model = xgb.XGBRegressor()