In [1]:
import pandas as pd
import numpy as np

In [23]:
from scipy.stats import pearsonr,spearmanr,kruskal,pointbiserialr
from sklearn.feature_selection import mutual_info_regression,f_regression

In [2]:
booknow_theaters = pd.read_csv('booknow_theaters.csv')
booknow_booking = pd.read_csv('booknow_booking.csv')
cinepos_theater = pd.read_csv('cinepos_theaters.csv')
cinepos_booking = pd.read_csv('cinepos_booking.csv')
booknow_visits = pd.read_csv('booknow_visits.csv')
date_info = pd.read_csv('date_info.csv')
relation_id = pd.read_csv('relation_id.csv')
sample_sub = pd.read_csv('sample_sub.csv')

In [4]:

booknow_booking['show_date'] = pd.to_datetime(booknow_booking['show_datetime'].apply(lambda x:x[:10]))
cinepos_booking['show_date'] = pd.to_datetime(cinepos_booking['show_datetime'].apply(lambda x:x[:10]))        
booknow_visits['show_date'] = pd.to_datetime(booknow_visits['show_date'])

booknow_visits['week_day'] = booknow_visits['show_date'].apply(lambda x:x.dayofweek)
booknow_visits['month'] = booknow_visits['show_date'].apply(lambda x:x.month)
booknow_visits['year'] = booknow_visits['show_date'].apply(lambda x:x.year)
booknow_visits['day'] = booknow_visits['show_date'].apply(lambda x:x.dayofyear)

booknow_booking2 = booknow_booking.groupby(['book_theater_id','show_date'],as_index=False).agg({'tickets_booked':'sum'})
cinepos_booking2 = cinepos_booking.groupby(['cine_theater_id','show_date'],as_index=False).agg({'tickets_sold':'sum'})

booknow_booking2['week_day'] = booknow_booking2['show_date'].apply(lambda x:x.dayofweek)
booknow_booking2['month'] = booknow_booking2['show_date'].apply(lambda x:x.month)
cinepos_booking2['week_day'] = cinepos_booking2['show_date'].apply(lambda x:x.dayofweek)
cinepos_booking2['month'] = cinepos_booking2['show_date'].apply(lambda x:x.month)

In [5]:
weekday_tcktsold = cinepos_booking2.groupby('week_day').agg({'tickets_sold':'mean'}).astype(int).reset_index()
print("Average Tickets sold on each day in a week")
print(weekday_tcktsold)

Average Tickets sold on each day in a week
   week_day  tickets_sold
0         0             6
1         1             6
2         2             6
3         3             6
4         4             7
5         5             8
6         6             7


In [6]:
weekday_tcktsbook = booknow_booking2.groupby('week_day').agg({'tickets_booked':'mean'}).astype(int).reset_index()
print("Average Tickets booked on each day in a week")
print(weekday_tcktsbook)

Average Tickets booked on each day in a week
   week_day  tickets_booked
0         0              14
1         1              10
2         2              11
3         3              12
4         4              12
5         5              16
6         6              17


In [7]:
month_tcktsold = cinepos_booking2.groupby('month').agg({'tickets_sold':'mean'}).astype(int).reset_index()
print("Average Tickets sold on each month")
print(month_tcktsold)

Average Tickets sold on each month
    month  tickets_sold
0       1             7
1       2             6
2       3             7
3       4             7
4       5             6
5       6             6
6       7             6
7       8             6
8       9             6
9      10             6
10     11             6
11     12            10


In [8]:
month_tcktsbooked = booknow_booking2.groupby('month').agg({'tickets_booked':'mean'}).astype(int).reset_index()
print("Average Tickets booked on each month")
print(month_tcktsbooked)

Average Tickets booked on each month
    month  tickets_booked
0       1              13
1       2              13
2       3              14
3       4              12
4       5              12
5       6              14
6       7              13
7       8               2
8       9               2
9      10              12
10     11              12
11     12              17


This data will help in creating features in the train and test data

In [9]:
booknow_visits['mean_audience'] = (booknow_visits.groupby('book_theater_id')['audience_count'].transform('mean')).astype(int)

Adding a time series Based feature , Cumulative mean of audience count named as prior_info

In [10]:
booknow_visits['prior_info'] = booknow_visits.groupby('book_theater_id')['audience_count'].expanding().mean().shift(1).reset_index(level=0, drop=True)

In [11]:
booknow_visits['prior_info'] = booknow_visits['prior_info'].fillna(0).astype(int)

In [12]:
def tckts_sold_week(x):
    return weekday_tcktsold.iloc[x]['tickets_sold']

In [13]:
def tckts_book_week(x):
    return weekday_tcktsbook.iloc[x]['tickets_booked']


In [14]:
def tckts_sold_month(x):
    return month_tcktsold.iloc[x]['tickets_sold']

In [15]:
def tckts_book_month(x):
    return month_tcktsbooked.iloc[x]['tickets_booked']

Now adding those features in the booknow_visits

In [17]:
booknow_visits['tkt_book_month'] = booknow_visits['month'].apply(lambda x:tckts_book_month(x-1))
booknow_visits['tkt_sold_month'] = booknow_visits['month'].apply(lambda x:tckts_sold_month(x-1))
booknow_visits['tkt_book_week'] = booknow_visits['week_day'].apply(lambda x:tckts_book_week(x))
booknow_visits['tkt_sold_week'] = booknow_visits['week_day'].apply(lambda x:tckts_sold_week(x))

In [18]:
booknow_visits['is_winter'] = booknow_visits['month'].apply(lambda x:1 if x in [1,2,11,12] else 0)
booknow_visits['is_summer'] = booknow_visits['month'].apply(lambda x:1 if x in [3,4,5,6] else 0)
booknow_visits['is_weekend'] = booknow_visits['month'].apply(lambda x:1 if x in [5,6] else 0)

Merging the booknow_visits and booknow theaters to gain more theater information

In [19]:
c1 = pd.merge(booknow_visits,booknow_theaters,on='book_theater_id',how='left')

**Handling missing Values in this**

In [20]:
long_mean = booknow_theaters['longitude'].mean()
lat_mean = booknow_theaters['latitude'].mean()
c1['longitude'] = c1['longitude'].fillna(long_mean)
c1['latitude'] = c1['latitude'].fillna(lat_mean)
theater_type_mode = c1['theater_type'].mode()[0]
theater_area_mode = c1['theater_area'].mode()[0]
c1['theater_type'] = c1['theater_type'].fillna(theater_type_mode)
c1['theater_area'] = c1['theater_area'].fillna(theater_type_mode)

Adding some lag features , lag1 and lag7

In [21]:
c1['lag1'] = c1['audience_count'].shift(1)
c1['lag7'] = c1['audience_count'].shift(6)
c1['lag1'] = c1['lag1'].fillna(0)
c1['lag7'] = c1['lag7'].fillna(0)

Corelations and feature testing with different techniques such as spearmans , pointbiserial and kruskal wallis tests

In [24]:
corr , p = spearmanr(c1['week_day'],c1['audience_count'])
print(f'Correlation of week day with target Audience count is {corr} with a p value {p}')

Correlation of week day with target Audience count is 0.1047746650551535 with a p value 0.0


Weak correlation but significant

In [25]:
grps  = [c1[c1['week_day']==d]['audience_count'] for d in [0,1,2,3,4,5,6]]
H, p = kruskal(*grps)
print(f" With krukals wallis test H-statistic: {H} p-value {p}")

 With krukals wallis test H-statistic: 7089.971042453384 p-value 0.0


So week day might be a strong predictor

In [26]:
grps2  = [c1[c1['month']==m]['audience_count'] for m in [1,2,3,4,5,6,7,8,9,10,11,12]]
H, p = kruskal(*grps2)
print(f"For month and Audience count the H-statistic is {H} p-value  {p}")

For month and Audience count the H-statistic is 849.2486638651989 p-value  5.009299060796749e-175


month has a strong seasonal effect on audience behavior, making it an important predictive feature.

In [27]:
fs = ['tkt_book_month',
       'tkt_sold_month', 'tkt_book_week', 'tkt_sold_week']
for f in fs:
    groups = [g['audience_count'] for _, g in c1.groupby(f)]
    H, p = kruskal(*groups)
    print(f'for feature {f} H-statistic {H} p value {p}')

for feature tkt_book_month H-statistic 516.005261657033 p value 2.3130514344434994e-110
for feature tkt_sold_month H-statistic 354.6791925399063 p value 9.602668067733754e-78
for feature tkt_book_week H-statistic 7085.75808651096 p value 0.0
for feature tkt_sold_week H-statistic 2291.025692515295 p value 0.0


All four features (monthly/weekly booked and sold tickets) show extremely strong statistical differences across groups, meaning they strongly influence audience_count.

->Weekly features have much larger H-statistics, indicating weekly seasonality patterns are far more impactful than monthly patterns.

->Overall, these features carry high predictive power and should be kept in your model for capturing demand trends.

In [28]:
bin_fs = ['is_winter',
       'is_summer', 'is_weekend']
for f in bin_fs:
    corr, p = pointbiserialr(c1['audience_count'], c1[f])
    print(f'for feature {f} , correlation is {corr} p_value {p}')

for feature is_winter , correlation is -0.002123191299684027 p_value 0.3259559809748414
for feature is_summer , correlation is 0.03557071195324725 p_value 6.885715138079897e-61
for feature is_weekend , correlation is 0.014993670105597993 p_value 3.9998503132746e-12


->is_summer and is_weekend show statistically significant positive correlations with audience_count, meaning audience levels tend to be higher in summer and on weekends.

->is_winter shows no significant relationship (very small correlation, high p-value), indicating that winter does not meaningfully affect audience_count.