## Problem Statement - 24

### Perform the following operations using Python on ForestFires Dataset.
### a. Create data subsets by making classes for amount of region affected.(e.g. NotAffected, Partially affected, Mostlyaffected).
### b. Merge two subsets
### c. Sort Data using Temperature, wind and area.
### d. Transposing Data
### e. Melting Data to long format
### f. Casting data to wide format

In [1]:
import pandas as pd 

In [2]:
# Load the dataset
df = pd.read_csv('forestfires.csv', header=None)

# Add appropriate column names (based on UCI Forest Fires Dataset)
df.columns = [
    'X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI',
    'temp', 'RH', 'wind', 'rain', 'area'
]

df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
1,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0
2,7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0
3,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0,0
4,8,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,0


In [3]:
# Convert 'area' to numeric 
df['area'] = pd.to_numeric(df['area'], errors='coerce')

###  Create data subsets by making classes for amount of region affected.(e.g. NotAffected, Partially affected, Mostlyaffected). 

In [4]:
# Create custom category
def classify_area(area):
    if area == 0:
        return "NotAffected"
    elif area <= 10:
        return "PartiallyAffected"
    else:
        return "MostlyAffected"

df['RegionImpact'] = df['area'].apply(classify_area)

# Subsets
not_affected = df[df['RegionImpact'] == 'NotAffected']
partially_affected = df[df['RegionImpact'] == 'PartiallyAffected']
mostly_affected = df[df['RegionImpact'] == 'MostlyAffected']

In [5]:
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,RegionImpact
0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,,MostlyAffected
1,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0.0,NotAffected
2,7,4,oct,tue,90.6,35.4,669.1,6.7,18,33,0.9,0,0.0,NotAffected
3,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0,0.0,NotAffected
4,8,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,0.0,NotAffected


### Merge two subsets

In [6]:
merged_df = pd.concat([partially_affected, mostly_affected], ignore_index=True)
merged_df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,RegionImpact
0,9,9,jul,tue,85.8,48.3,313.4,3.9,18.0,42,2.7,0,0.36,PartiallyAffected
1,1,4,sep,tue,91.0,129.5,692.6,7.0,21.7,38,2.2,0,0.43,PartiallyAffected
2,2,5,sep,mon,90.9,126.5,686.5,7.0,21.9,39,1.8,0,0.47,PartiallyAffected
3,1,2,aug,wed,95.5,99.9,513.3,13.2,23.3,31,4.5,0,0.55,PartiallyAffected
4,8,6,aug,fri,90.1,108.0,529.8,12.5,21.2,51,8.9,0,0.61,PartiallyAffected


### Sort Data using Temperature, wind and area.

In [7]:
sorted_df = df.sort_values(by=['temp', 'wind', 'area'], ascending=[True, True, True])
sorted_df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,RegionImpact
396,4,5,feb,sun,85.0,9.0,56.9,3.5,10.1,62,1.8,0,51.78,MostlyAffected
348,5,4,sep,fri,92.1,99.0,745.3,9.6,10.1,75,3.6,0,0.0,NotAffected
351,5,4,sep,fri,92.1,99.0,745.3,9.6,10.1,75,3.6,0,3.71,PartiallyAffected
467,6,5,mar,mon,87.2,15.1,36.9,7.1,10.2,45,5.8,0,3.18,PartiallyAffected
121,3,4,aug,mon,91.5,145.4,608.2,10.7,10.3,74,2.2,0,0.0,NotAffected


### Transposing Data

In [8]:
# Transposing means converting rows to columns and columns to rows.

transposed_df = df.transpose() #transposing the data
print("\nTransposed Data:")
transposed_df.head()


Transposed Data:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,508,509,510,511,512,513,514,515,516,517
X,X,7,7,7,8,8,8,8,8,8,...,2,1,5,6,8,4,2,7,1,6
Y,Y,5,4,4,6,6,6,6,6,6,...,4,2,4,5,6,3,4,4,4,3
month,month,mar,oct,oct,mar,mar,aug,aug,aug,sep,...,aug,aug,aug,aug,aug,aug,aug,aug,aug,nov
day,day,fri,tue,sat,fri,sun,sun,mon,mon,tue,...,fri,fri,fri,fri,sun,sun,sun,sun,sat,tue
FFMC,FFMC,86.2,90.6,90.6,91.7,89.3,92.3,92.3,91.5,91,...,91,91,91,91,81.6,81.6,81.6,81.6,94.4,79.5


### Melting Data to long format

In [9]:
# Melting is the process of turning wide-format data (where each variable has its own column) into long-format data 
# (where there are only two or three columns: one for the variable name and one for the value).

# Melt selected columns
melted_df = pd.melt(df, id_vars=['month', 'day'], value_vars=['temp', 'RH', 'wind', 'rain'],
                    var_name='Measurement', value_name='Value')
print("\nMelted Data (Long Format):")
melted_df.head()


Melted Data (Long Format):


Unnamed: 0,month,day,Measurement,Value
0,month,day,temp,temp
1,mar,fri,temp,8.2
2,oct,tue,temp,18
3,oct,sat,temp,14.6
4,mar,fri,temp,8.3


### Casting data to wide format

In [10]:
# Casting involves transforming long-format data into a wide format, where each unique value in one column becomes its own separate column.

melted_df['Value'] = pd.to_numeric(melted_df['Value'], errors='coerce')

# Now pivot (cast) back to wide format
wide_df = melted_df.pivot_table(
    index=['month', 'day'],
    columns='Measurement',
    values='Value',
    aggfunc='mean'  # or you can use 'sum' or another function
).reset_index()

print("\nPivoted (Wide Format):")
wide_df.head()


Pivoted (Wide Format):


Measurement,month,day,RH,rain,temp,wind
0,apr,fri,20.0,0.0,16.7,3.1
1,apr,mon,64.0,0.0,10.9,3.1
2,apr,sat,44.0,0.0,9.3,4.5
3,apr,sun,45.0,0.0,14.9,5.666667
4,apr,thu,54.0,0.0,5.8,5.8
