In [3]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Set a random seed for reproducibility
random.seed(42)

# Generate a practical sample dataset
def generate_practical_data(num_rows=500):
    data = []

    for i in range(num_rows):
        start_date = datetime(2020, 1, 1)
        project_duration = random.randint(30, 365)  # Project duration in days
        end_date = start_date + timedelta(days=project_duration)

        land_size = random.uniform(800, 5000)
        num_floors = random.randint(1, 15)
        
        # Establish relationships to ensure practical values
        tons_cement = max(10, min(land_size / 5, num_floors * 100))
        tons_concrete = max(50, min(land_size / 10, num_floors * 200))
        tons_sand = max(20, min(land_size / 20, num_floors * 50))
        
        labor_required = max(5, min(num_floors * 5, 150))

        data.append({
            'Project_ID': f'P-{i + 1}',
            'Land_Size_Sqm': round(land_size,2),
            'Num_Floors': num_floors,
            'Tons_Cement': round(tons_cement, 2),
            'Tons_Concrete': round(tons_concrete, 2),
            'Tons_Sand': round(tons_sand, 2),
            'Labor_Required': labor_required,
            'Start_Date': start_date,
            'End_Date': end_date,
        })

    return data

# Create a DataFrame
dataset = pd.DataFrame(generate_practical_data())
csv_file_path = '/Users/vyakaranamsowmya/Bob-s-Esti-Mate/output.csv'

dataset.to_csv(csv_file_path, index=False)

print(dataset)

    Project_ID  Land_Size_Sqm  Num_Floors  Tons_Cement  Tons_Concrete  \
0          P-1        1267.59          12       253.52         126.76   
1          P-2        1828.55           3       300.00         182.85   
2          P-3        3642.14          15       728.43         364.21   
3          P-4        1165.14           7       233.03         116.51   
4          P-5         925.15           4       185.03          92.51   
..         ...            ...         ...          ...            ...   
495      P-496        1461.12          10       292.22         146.11   
496      P-497        3630.67           5       500.00         363.07   
497      P-498        3580.14           8       716.03         358.01   
498      P-499        2656.59           5       500.00         265.66   
499      P-500        3970.76           2       200.00         397.08   

     Tons_Sand  Labor_Required Start_Date   End_Date  
0        63.38              60 2020-01-01 2020-12-23  
1        91.4

In [4]:
df=pd.read_csv('output.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Project_ID      500 non-null    object 
 1   Land_Size_Sqm   500 non-null    float64
 2   Num_Floors      500 non-null    int64  
 3   Tons_Cement     500 non-null    float64
 4   Tons_Concrete   500 non-null    float64
 5   Tons_Sand       500 non-null    float64
 6   Labor_Required  500 non-null    int64  
 7   Start_Date      500 non-null    object 
 8   End_Date        500 non-null    object 
dtypes: float64(4), int64(2), object(3)
memory usage: 35.3+ KB


In [5]:
df['End_Date'] = pd.to_datetime(df['End_Date'], format="%Y-%m-%d")
df['Start_Date ']=pd.to_datetime(df['Start_Date'],format="%Y-%m-%d")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Project_ID      500 non-null    object        
 1   Land_Size_Sqm   500 non-null    float64       
 2   Num_Floors      500 non-null    int64         
 3   Tons_Cement     500 non-null    float64       
 4   Tons_Concrete   500 non-null    float64       
 5   Tons_Sand       500 non-null    float64       
 6   Labor_Required  500 non-null    int64         
 7   Start_Date      500 non-null    object        
 8   End_Date        500 non-null    datetime64[ns]
 9   Start_Date      500 non-null    datetime64[ns]
dtypes: datetime64[ns](2), float64(4), int64(2), object(2)
memory usage: 39.2+ KB


In [9]:
df['No.of days'] = df['End_Date'] - df['Start_Date ']

In [10]:
df

Unnamed: 0,Project_ID,Land_Size_Sqm,Num_Floors,Tons_Cement,Tons_Concrete,Tons_Sand,Labor_Required,Start_Date,End_Date,Start_Date.1,Duration,No.of days
0,P-1,1267.59,12,253.52,126.76,63.38,60,2020-01-01,2020-12-23,2020-01-01,357 days,357 days
1,P-2,1828.55,3,300.00,182.85,91.43,15,2020-01-01,2020-06-19,2020-01-01,170 days,170 days
2,P-3,3642.14,15,728.43,364.21,182.11,75,2020-01-01,2020-03-23,2020-01-01,82 days,82 days
3,P-4,1165.14,7,233.03,116.51,58.26,35,2020-01-01,2020-11-05,2020-01-01,309 days,309 days
4,P-5,925.15,4,185.03,92.51,46.26,20,2020-01-01,2020-02-16,2020-01-01,46 days,46 days
...,...,...,...,...,...,...,...,...,...,...,...,...
495,P-496,1461.12,10,292.22,146.11,73.06,50,2020-01-01,2020-04-19,2020-01-01,109 days,109 days
496,P-497,3630.67,5,500.00,363.07,181.53,25,2020-01-01,2020-02-25,2020-01-01,55 days,55 days
497,P-498,3580.14,8,716.03,358.01,179.01,40,2020-01-01,2020-09-13,2020-01-01,256 days,256 days
498,P-499,2656.59,5,500.00,265.66,132.83,25,2020-01-01,2020-12-06,2020-01-01,340 days,340 days
