# **YouTube Movies Analysis**

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the sentiment CSV (a path under /kaggle/input); kept in one
# named constant so it is easy to spot and change.
DATA_PATH = '/kaggle/input/movies-youtube-trailers-and-sentimentdinesh-dinesh/movies_youtube_sentiments.csv'

# Read the whole file into the working frame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## **1. Basic Information:**

### What are the columns in the dataset?

In [None]:
# Display the column labels of the loaded frame.
df.columns

### How many rows are there in the dataset?

In [None]:
# len() of a DataFrame is its number of rows (same value as df.shape[0]).
print('Total rows in dataset are', len(df))

### What is the datatype of each column?

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
df.info()

## **2. Data Cleaning:**

### Are there any missing values in the dataset? If so, how are they distributed across columns?

In [None]:
# Count missing values per column: isnull() flags each cell, sum() adds the
# flags column by column.
df.isnull().sum()

### Are there any duplicate rows in the dataset?

In [None]:
# Boolean mask that is True for every row repeating an earlier row.
duplicate_mask = df.duplicated()

# Show only the repeated rows; an empty result means there are no duplicates.
df[duplicate_mask]

## **3. Data Understanding:**

### What is the distribution of movies across different genres?

In [None]:
# Number of movies per genre, most common first. value_counts() answers the
# "distribution" question directly; wrapping a groupby in a DataFrame only
# listed the genre names and had to materialise every group to do so.
df['genre'].value_counts()

### How many unique directors, writers, and stars are there in the dataset?

In [None]:
# Number of distinct directors (missing values are not counted).
df['director'].nunique()

In [None]:
# Number of distinct writers (missing values are not counted).
df['writer'].nunique()

In [None]:
# Number of distinct stars (missing values are not counted).
df['star'].nunique()

### What is the distribution of sentiment scores (positive, neutral, negative)?

In [None]:
import ast

def convert_to_dict(text):
    """Parse the textual form of a Python literal into the object it denotes.

    Args:
        text: A string holding a Python literal (here, the repr of a dict),
            or a dict that has already been parsed.

    Returns:
        The parsed object. A dict input is returned unchanged, so applying
        this function a second time to the same column (for example when the
        cell is re-run) is a no-op instead of an error.

    Raises:
        ValueError, SyntaxError: If ``text`` is not a valid Python literal.
    """
    if isinstance(text, dict):
        # Already converted on a previous run; literal_eval would reject it.
        return text
    return ast.literal_eval(text)

# Replace each textual entry of the column with the object it encodes.
df['sentiment_scores'] = df['sentiment_scores'].map(convert_to_dict)

def extract_sentiment_counts(row):
    """Pull the three sentiment values out of one mapping.

    Args:
        row: A mapping with 'positive', 'negative' and 'neutral' keys.

    Returns:
        A 3-tuple ordered as (positive, negative, neutral).

    Raises:
        KeyError: If any of the three keys is missing.
    """
    ordered_keys = ('positive', 'negative', 'neutral')
    return tuple(row[key] for key in ordered_keys)

# Build the three count columns in a single DataFrame construction.
# (.apply(pd.Series) would create one Series object per row, which is slow.)
# index=df.index keeps the rows aligned with df; the explicit column list
# keeps the shape well-defined even when df has no rows.
count_columns = ['positive_count', 'negative_count', 'neutral_count']
sentiment_counts = pd.DataFrame(
    df['sentiment_scores'].apply(extract_sentiment_counts).tolist(),
    index=df.index,
    columns=count_columns,
)
df[count_columns] = sentiment_counts

# Totals over all rows for each sentiment class.
total_positive = df['positive_count'].sum()
total_negative = df['negative_count'].sum()
total_neutral = df['neutral_count'].sum()

print("Total Positive:", total_positive)
print("Total Negative:", total_negative)
print("Total Neutral:", total_neutral)

### What is the distribution of favorability scores?

In [None]:
def calculate_favorability_score(row):
    """Compute (positive - negative) as a fraction of the sum of all values.

    Args:
        row: A mapping of sentiment label to numeric value; 'positive' and
            'negative' are read whenever the overall sum is greater than zero.

    Returns:
        (positive - negative) divided by the sum of every value in ``row``,
        or 0 when that sum is not greater than zero.
    """
    mentions = sum(row.values())
    if not mentions > 0:
        # Nothing to normalise by; avoid dividing by zero.
        return 0
    net_positive = row['positive'] - row['negative']
    return net_positive / mentions

# Score every row from its sentiment mapping.
# NOTE(review): if the CSV already provides a 'favorability' column, this
# assignment replaces it — confirm that is intended.
df['favorability'] = df['sentiment_scores'].apply(calculate_favorability_score)

# pandas returns the Axes it drew on, so the labels are set on that object
# directly instead of going through the pyplot "current axes" state.
ax = df['favorability'].plot.hist(bins=10, color='skyblue', edgecolor='black')
ax.set_xlabel('Favorability Score')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Favorability Scores')

plt.show()

### How are sentiment scores correlated with favorability scores?

In [None]:
# Per-row sum of every value stored in the sentiment mapping.
df['sentiment_scores_sum'] = df['sentiment_scores'].map(
    lambda counts: sum(counts.values())
)

# Correlation (Series.corr default method) between that sum and favorability.
totals = df['sentiment_scores_sum']
favorability = df['favorability']
correlation = totals.corr(favorability)

print("Correlation between sentiment_scores and favorability:", correlation)

## **4. Exploratory Visualization:**

### Plot a histogram of sentiment scores.

In [None]:
# Keep the per-row totals in their own variable. Assigning them back to
# df['sentiment_scores'] would discard the per-class dicts that other cells
# rely on and would make this cell fail when run a second time (an int has
# no .values()).
sentiment_totals = df['sentiment_scores'].apply(lambda scores: sum(scores.values()))

plt.hist(sentiment_totals, bins=5, color='skyblue', edgecolor='black')

plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.title('Histogram of Sentiment Scores')

plt.show()

### Plot a bar chart showing the number of movies released each year.

In [None]:
# Count rows per release year, then order the counts chronologically.
year_counts = df['year'].value_counts()
movies_per_year = year_counts.sort_index()

# Draw on an explicit Axes rather than the implicit pyplot one.
fig, ax = plt.subplots()
ax.bar(
    movies_per_year.index,
    movies_per_year.values,
    color='skyblue',
    edgecolor='black',
)
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies')
ax.set_title('Number of Movies Released Each Year')
plt.show()

### Visualize the distribution of favorability scores using a box plot.

In [None]:
# Box plot of the favorability column. The seaborn call is left exactly as
# written; only a title is added so the figure is self-explanatory like the
# other plots in this notebook.
# NOTE(review): orient='y' appears to be accepted only by recent seaborn
# releases — confirm against the installed version.
ax = sns.boxplot(df['favorability'],orient='y',color='darkblue')
ax.set_title('Distribution of Favorability Scores')
plt.show()

### Plot a scatter plot showing the relationship between budget and gross revenue.

In [None]:
# One point per row: budget on the x-axis, gross on the y-axis.
fig, ax = plt.subplots()
ax.scatter(df['budget'], df['gross'])
ax.set_xlabel('Budget')
ax.set_ylabel('Gross')
ax.set_title('Relationship between budget and gross')
plt.show()

## **5. Advanced Analysis:**

### Is there a correlation between sentiment scores and box office performance (e.g., gross revenue)?

In [None]:
def aggregate_sentiment_score(score):
    """Collapse one sentiment mapping into a single net value.

    Args:
        score: Expected to be a dict; its 'positive' and 'negative' entries
            are read, each defaulting to 0 when absent.

    Returns:
        positive minus negative for a dict input, or 0 for any non-dict input.
    """
    if not isinstance(score, dict):
        return 0
    return score.get('positive', 0) - score.get('negative', 0)
# Net sentiment per movie (positive minus negative), computed from the
# numeric count columns. Those columns stay valid even when
# df['sentiment_scores'] no longer holds dicts, in which case a dict-based
# helper would return its fallback for every row and the correlation would
# come out as NaN. The subtraction is also vectorised.
df['aggregated_sentiment_score'] = df['positive_count'] - df['negative_count']

correlation = df['aggregated_sentiment_score'].corr(df['gross'])

print("Correlation between aggregated sentiment scores and box office performance:", correlation)

### Are certain genres more likely to receive positive sentiment scores?

In [None]:
def aggregate_sentiment_score(score):
    """Return positive minus negative for a sentiment dict, else 0.

    Args:
        score: A dict whose 'positive' / 'negative' entries default to 0 when
            missing; any other type is treated as "no score".

    Returns:
        The difference between the two entries, or 0 for non-dict input.
    """
    is_mapping = isinstance(score, dict)
    return score.get('positive', 0) - score.get('negative', 0) if is_mapping else 0

# Net sentiment per movie (positive minus negative), taken from the numeric
# count columns so the result does not depend on df['sentiment_scores'] still
# holding dicts; otherwise every row would silently receive the fallback
# value and each genre average would be identical.
df['aggregated_sentiment_score'] = df['positive_count'] - df['negative_count']

# Mean net sentiment for each genre.
genre_sentiment_avg = df.groupby('genre')['aggregated_sentiment_score'].mean()

print("Average sentiment score by genre:")
print(genre_sentiment_avg)

### Are movies with higher budgets more likely to have positive sentiment scores?

In [None]:
def aggregate_sentiment_score(score):
    """Return positive minus negative for a sentiment dict, else None.

    Args:
        score: A dict whose 'positive' / 'negative' entries default to 0 when
            missing.

    Returns:
        The difference between the two entries, or None when ``score`` is not
        a dict.
    """
    if isinstance(score, dict):
        positives = score.get('positive', 0)
        negatives = score.get('negative', 0)
        return positives - negatives
    return None

# Net sentiment per movie (positive minus negative), computed from the
# numeric count columns. If df['sentiment_scores'] does not hold dicts, a
# dict-based helper yields None for every row, the dropna below then removes
# all rows, and the correlation is NaN.
df['aggregated_sentiment_score'] = df['positive_count'] - df['negative_count']

# Drop rows that still lack a score before correlating.
df_filtered = df.dropna(subset=['aggregated_sentiment_score'])

correlation = df_filtered['budget'].corr(df_filtered['aggregated_sentiment_score'])

print("Correlation between budget and sentiment score:", correlation)