### Read from CSV


In [143]:
import pandas as pd

# Path of the raw dataset, resolved relative to the notebook's working directory.
file_path = 'datasett.csv'

# Load the CSV into `df`, the DataFrame the following cells operate on.
df = pd.read_csv(filepath_or_buffer=file_path)

### Text Cleaning


In [144]:
from rdatapp import TextCleaner

# A single TextCleaner instance is reused for every value of the 'Summary' column.
text_cleaner = TextCleaner()

# Store the cleaned text in a new column so the raw 'Summary' stays available.
df['Cleaned_Summary'] = df.loc[:, 'Summary'].apply(text_cleaner.clean_text)

# Preview the raw text next to its cleaned counterpart (first five rows).
print(df.loc[:, ['Summary', 'Cleaned_Summary']].head(5))


                                             Summary  \
0  A group of college students get more than they...   
1  A documentary that sheds light on the devastat...   
2  A lost civilization is rediscovered deep withi...   
3  A young inventor builds a time machine and emb...   
4  A young inventor builds a time machine and emb...   

                                     Cleaned_Summary  
0  group college student get bargained spend week...  
1  documentary shed light devastating effect clim...  
2  lost civilization rediscovered deep within ama...  
3  young inventor build time machine embarks jour...  
4  young inventor build time machine embarks jour...  


### Missing Value Handling

In [145]:
from rdatapp import MissingValueHandler

# Fill the missing 'Budget in USD' entries through MissingValueHandler.impute_mean
# (presumably with the column mean -- confirm against the rdatapp implementation).
# NOTE(review): in the recorded output rows 52 and 59 both show 4.763700e+07,
# so it looks like more than one row was imputed, not only index 52 -- confirm.
df = MissingValueHandler.impute_mean(df, 'Budget in USD')
print(df.loc[:, 'Budget in USD'].head(5))

# Label-based slice: .loc includes both endpoints, so rows 40 through 60 are shown.
filtered_df = df.loc[40:60, ['Shooting Location', 'Budget in USD']]
print(filtered_df)

0    1.035408e+07
1    5.722105e+06
2    5.601372e+07
3    4.422451e+06
4    1.163290e+08
Name: Budget in USD, dtype: float64
   Shooting Location  Budget in USD
40          New York   5.441482e+07
41            Sydney   8.058753e+07
42            London   2.954130e+07
43       Los Angeles   1.281218e+07
44          New York   4.307335e+07
45       Los Angeles   9.016003e+06
46             Tokyo   7.606288e+07
47           Toronto   4.802288e+07
48             Paris   2.137900e+07
49             Tokyo   1.262986e+07
50       Los Angeles   5.360627e+07
51             Paris   6.789908e+07
52            Sydney   4.763700e+07
53            Sydney   7.047465e+07
54           Toronto   6.101155e+07
55             Paris   5.427362e+06
56             Tokyo   6.504917e+07
57       Los Angeles   5.215293e+07
58            London   8.272745e+07
59             Tokyo   4.763700e+07
60          New York   1.659002e+07


### Encoding


In [146]:
from rdatapp import CategoricalEncoder

# Replace `df` with the result of one-hot encoding its 'Genre' column.
df = CategoricalEncoder.one_hot_encode(df, 'Genre')

# Keep only the columns whose label contains 'Genre' to inspect the encoding.
encoded_df = df.filter(like='Genre')
print(encoded_df.head(5))


   Genre_Action  Genre_Adventure  Genre_Animation  Genre_Comedy  Genre_Crime  \
0           0.0              0.0              0.0           0.0          0.0   
1           0.0              0.0              0.0           0.0          0.0   
2           0.0              1.0              0.0           0.0          0.0   
3           0.0              0.0              0.0           0.0          0.0   
4           0.0              0.0              0.0           0.0          0.0   

   Genre_Documentary  Genre_Drama  Genre_Family  Genre_Fantasy  Genre_Foreign  \
0                0.0          0.0           0.0            0.0            0.0   
1                1.0          0.0           0.0            0.0            0.0   
2                0.0          0.0           0.0            0.0            0.0   
3                0.0          0.0           0.0            1.0            0.0   
4                0.0          0.0           0.0            1.0            0.0   

   Genre_History  Genre_Horror  

### Outlier Detection

In [147]:
from rdatapp import OutlierHandler

# Summary statistics of "Rating" before outlier handling.
print(df.loc[:, 'Rating'].describe())

# Apply the IQR-based outlier handling to the "Rating" column.
# The recorded output shows the row count dropping from 1000 to 913.
df = OutlierHandler.iqr_outlier_detection(df, 'Rating')

# Same summary afterwards, for a before/after comparison.
print(df.loc[:, 'Rating'].describe())


count    1000.000000
mean        8.051919
std         1.768733
min         3.023367
25%         7.470223
50%         8.646978
75%         9.283838
max         9.985402
Name: Rating, dtype: float64
count    913.000000
mean       8.454158
std        1.240696
min        4.771495
25%        7.900184
50%        8.772862
75%        9.357609
max        9.985402
Name: Rating, dtype: float64


### Scaling

In [148]:
from rdatapp import Scaler

# Min-max scale the "Rating" column via Scaler.min_max_scale.
df = Scaler.min_max_scale(df, 'Rating')

# NOTE(review): the recorded output skips index label 2 -- presumably that row
# was removed by the earlier outlier step and the index was not reset; confirm.
print(df.loc[:, ['Rating']].head(5))

     Rating
0  0.954885
1  0.602500
3  0.774906
4  0.091965
5  0.748976


### Feature Engineering

In [149]:
from rdatapp.feature_engineer import FeatureEngineer

# Derive a feature from 'Budget in USD' by dividing each value by 1e6.
# The recorded output shows the result stored in the 'Budget in USD_new' column.
df = FeatureEngineer.create_new_feature(
    df,
    'Budget in USD',
    lambda x: x / 1e6,
)

# Compare the original budget with the derived feature (first five rows).
print(df.loc[:, ['Budget in USD', 'Budget in USD_new']].head(5))


   Budget in USD  Budget in USD_new
0   1.035408e+07          10.354075
1   5.722105e+06           5.722105
3   4.422451e+06           4.422451
4   1.163290e+08         116.328972
5   2.406163e+07          24.061629


### Date-Time Handling

In [150]:
from rdatapp import DateTimeHandler

# Step 1: convert the "Release Date" column to datetime values.
df = DateTimeHandler.to_datetime(df, 'Release Date')

# Step 2: derive year / month / day columns from the converted "Release Date".
df = DateTimeHandler.extract_date_parts(df, 'Release Date')

# NOTE(review): the recorded output shows NaT for rows 3 and 4 and float-typed
# date parts (e.g. 1985.0) -- presumably some dates are missing or failed to
# parse; confirm against the raw data and the DateTimeHandler implementation.
print(
    df.loc[
        :,
        ['Release Date', 'Release Date_year', 'Release Date_month', 'Release Date_day'],
    ].head(5)
)


  Release Date  Release Date_year  Release Date_month  Release Date_day
0   1985-07-07             1985.0                 7.0               7.0
1   1999-05-08             1999.0                 5.0               8.0
3          NaT                NaN                 NaN               NaN
4          NaT                NaN                 NaN               NaN
5   2013-04-06             2013.0                 4.0               6.0
