In [None]:
import os
import re
import warnings
from datetime import datetime, timezone  # Ensure timezone is imported

import certifi
import isodate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from googleapiclient.discovery import build
from pymongo import MongoClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import (
    SelectKBest,
    VarianceThreshold,
    chi2,
    f_regression,
    mutual_info_regression,
)
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


In [None]:

# Read the connection string from the environment so that credentials are never
# stored in the notebook, e.g.
#   export MONGODB_URI="mongodb+srv://<user>:<password>@<cluster-host>/?retryWrites=true&w=majority"
# A missing variable raises KeyError here instead of failing later with an opaque error.
# NOTE(review): the password that was previously committed in this cell should be rotated.
con = os.environ["MONGODB_URI"]

# Create a MongoClient instance with CA bundle specified
client = MongoClient(con, tls=True, tlsCAFile=certifi.where())

# Attempt to get server information to confirm connection
client.server_info()  # Forces a call to the server
print("Successfully connected to MongoDB.")

# Database that stores the collected channel statistics
db = client['Project1']


In [None]:
# The API key is read from the environment so that it is never stored in the notebook.
# NOTE(review): the key that was previously committed in this cell should be revoked.
youtube = build('youtube', 'v3', developerKey=os.environ['YOUTUBE_API_KEY'])

# Only videos published on or after this date (interpreted as UTC) are counted.
# user_input = input('Enter the start date in YYYY-MM-DD format (data will be fetched from this date onwards): ')
user_input = '2022-12-01'
try:
    start_date = datetime.strptime(user_input, '%Y-%m-%d').replace(tzinfo=timezone.utc) if user_input else None
except ValueError as err:
    # Raise rather than call exit(): inside a notebook exit() may stop the kernel,
    # whereas an exception simply halts execution at this cell.
    raise ValueError('Invalid date format. Please use YYYY-MM-DD format.') from err

In [None]:
def get_channel_id_from_video_url(video_url):
    """Look up the ID of the channel that published a YouTube video.

    Args:
        video_url: Video link in ``youtu.be/<id>``, ``youtube.com/v/<id>``,
            ``youtube.com/e/<id>``, ``youtube.com/embed/<id>`` or
            ``youtube.com/watch?v=<id>`` form, where ``<id>`` is 11 characters.

    Returns:
        The channel ID string, or ``None`` when the URL is not recognised, the
        API returns no item for the video, or any step raises. Progress and
        error messages are printed.
    """
    try:
        # Dots are escaped so that only the literal host names match
        # (an unescaped '.' would also accept e.g. "youtuXbe/").
        video_id_match = re.search(
            r'(?:youtu\.be\/|youtube\.com\/(?:v|e(?:mbed)?)\/|youtube\.com\/watch\?v=)([a-zA-Z0-9_-]{11})',
            video_url,
        )
        if not video_id_match:
            print("Invalid YouTube video URL.")
            return None

        video_id = video_id_match.group(1)
        print(f"Video ID extracted: {video_id}")

        # Request the video's snippet, which carries the owning channel ID
        response = youtube.videos().list(part="snippet", id=video_id).execute()

        items = response.get('items') or []
        if not items:
            print("No channel found for this video.")
            return None

        channel_id = items[0]['snippet']['channelId']
        print(f"Channel ID for the video is: {channel_id}")
        return channel_id

    except Exception as e:
        # Best-effort lookup: report the failure and signal it with None
        print(f"An error occurred: {e}")
        return None

def get_video_statistics(video_id):
    """Fetch like count, comment count and duration for a single video.

    Args:
        video_id: YouTube video ID.

    Returns:
        Tuple ``(likes, comments, duration_seconds)``. ``(0, 0, 0)`` is returned
        when the API yields no item for the video or when any step raises;
        a message is printed in both cases.
    """
    try:
        video_response = youtube.videos().list(part='statistics,contentDetails', id=video_id).execute()

        # An empty result is handled explicitly instead of relying on an
        # IndexError being swallowed by the handler below.
        items = video_response.get('items') or []
        if not items:
            print(f'No video data returned for ID {video_id}.')
            return 0, 0, 0

        video_info = items[0]
        stats = video_info.get('statistics', {})
        details = video_info.get('contentDetails', {})

        # Counts that are absent from the response are treated as 0
        likes = int(stats.get('likeCount', 0))
        comments = int(stats.get('commentCount', 0))
        duration_seconds = parse_duration_to_seconds(details.get('duration', 'PT0S'))

        return likes, comments, duration_seconds
    except Exception as e:
        print(f'Error fetching video data for ID {video_id}: {e}')
        return 0, 0, 0

def parse_duration_to_seconds(duration):
    """Convert an ISO 8601 duration string (e.g. ``'PT1M30S'``) to whole seconds.

    Args:
        duration: Duration string as passed to ``isodate.parse_duration``.

    Returns:
        The duration in seconds as an int, or 0 when parsing or conversion
        fails (the failure is printed).
    """
    try:
        # Keep the parameter untouched so the error message below always shows
        # the original input, even if the failure happens after parsing.
        parsed = isodate.parse_duration(duration)
        return int(parsed.total_seconds())
    except Exception as e:
        print(f'Error parsing duration {duration}: {e}')
        return 0

def get_channel_statistics(channel_id, user_start_date):
    """Collect aggregate statistics for a channel and upsert them into MongoDB.

    The channel's uploads playlist is walked page by page. For every video
    published on or after the start date, likes and comments are summed and
    the video is counted as short (under 60 seconds) or long.

    Args:
        channel_id: YouTube channel ID. A falsy value is rejected up front.
        user_start_date: Datetime; only videos published on or after it are
            counted. A naive value is treated as UTC. When ``None``, the
            channel's own creation date is used, so every upload is counted.

    Returns:
        None. The result is written to the ``youtube_channel_data`` collection
        of ``db``; progress and errors are printed. Channels already present
        in the collection are skipped.
    """
    if not channel_id:
        # e.g. the URL lookup failed and produced None
        print('No channel ID supplied. Skipping data fetch.')
        return

    try:
        collection = db['youtube_channel_data']

        # Check if the channel_id already exists in MongoDB
        if collection.find_one({'channel_id': channel_id}):
            print(f"Channel ID {channel_id} already exists. Skipping data fetch.")
            return  # Skip fetching data if it already exists

        channel_response = youtube.channels().list(part='snippet,contentDetails,statistics', id=channel_id).execute()
        channel_items = channel_response.get('items') or []
        channel_info = channel_items[0] if channel_items else {}
        if not channel_info:
            print(f'Channel with ID {channel_id} not found or no data available.')
            return

        snippet = channel_info.get('snippet', {})
        statistics = channel_info.get('statistics', {})
        content_details = channel_info.get('contentDetails', {})

        # Channel creation date. A missing 'publishedAt' makes fromisoformat raise
        # ValueError, which is reported by the handler at the bottom.
        channel_start_date = snippet.get('publishedAt', 'NA')
        channel_start_date = datetime.fromisoformat(channel_start_date.replace('Z', '+00:00'))

        # Use the caller's start date for filtering (not a notebook global);
        # fall back to the channel creation date when none is given.
        start_from = user_start_date if user_start_date is not None else channel_start_date
        if start_from.tzinfo is None:
            # Upload dates are timezone-aware; comparing them with a naive value would raise
            start_from = start_from.replace(tzinfo=timezone.utc)

        # Get upload playlist ID
        uploads_playlist_id = content_details.get('relatedPlaylists', {}).get('uploads', 'NA')
        if uploads_playlist_id == 'NA':
            print(f'No uploads playlist found for channel ID {channel_id}.')
            return

        total_likes = 0
        total_comments = 0
        short_videos_count = 0
        long_videos_count = 0

        next_page_token = None
        while True:
            playlist_items_response = youtube.playlistItems().list(
                part='contentDetails,snippet',
                playlistId=uploads_playlist_id,
                maxResults=50,
                pageToken=next_page_token
            ).execute()

            for item in playlist_items_response.get('items', []):
                video_id = item['contentDetails']['videoId']
                video_upload_date = item['snippet'].get('publishedAt', 'NA')
                video_upload_date = datetime.fromisoformat(video_upload_date.replace('Z', '+00:00'))

                if video_upload_date >= start_from:
                    likes, comments, duration_seconds = get_video_statistics(video_id)
                    total_likes += likes
                    total_comments += comments
                    # Videos shorter than one minute are counted as "short"
                    if duration_seconds < 60:
                        short_videos_count += 1
                    else:
                        long_videos_count += 1

            # Keep paging until the API stops returning a continuation token
            next_page_token = playlist_items_response.get('nextPageToken')
            if not next_page_token:
                break

        # Define the nested structure
        channel_data = {
            'channel_id': channel_id,
            'channel_details': {
                'channel_name': snippet.get('title', 'NA'),
                'channel_start_date': channel_start_date.isoformat(),
                'inception_date': start_from.isoformat(),
                'total_no_of_videos': statistics.get('videoCount', 'NA'),
                'total_no_short_videos': short_videos_count,
                'total_no_long_videos': long_videos_count,
                'total_views': statistics.get('viewCount', 'NA'),
                'total_likes': total_likes,
                'total_comments': total_comments,
                'total_subscribers': statistics.get('subscriberCount', 'NA'),
            }
        }

        # Insert or update the data in MongoDB, keyed by channel_id
        collection.update_one(
            {'channel_id': channel_id},
            {'$set': channel_data},
            upsert=True
        )
        print(channel_data)
        print(f'Data for channel ID {channel_id} inserted/updated in MongoDB.')

    except Exception as e:
        print(f'Error fetching channel data for ID {channel_id}: {e}')



In [None]:

link = 'https://www.youtube.com/watch?v=p7V4Aa7qEpw'
# Despite the plural name this holds a single channel ID, or None when the lookup failed
CHANNEL_IDS = get_channel_id_from_video_url(link)
if CHANNEL_IDS:
    get_channel_statistics(CHANNEL_IDS, start_date)
else:
    # Do not query MongoDB / the API with a missing channel ID
    print(f'Could not resolve a channel for {link}; nothing fetched.')

In [None]:
# CHANNEL_IDS = ['UCpAbD88ier5LH7p4f1aFmyQ']
# # Fetch statistics for each channel and store in MongoDB
# for channel_id in CHANNEL_IDS:
#     print(f'Channel with ID {channel_id}')
#     get_channel_statistics(channel_id, start_date)

In [None]:
collection = db['youtube_channel_data']

# Projection: the channel id plus every statistic stored under 'channel_details'
projected_fields = (
    'channel_id',
    'channel_details.channel_name',
    'channel_details.channel_start_date',
    'channel_details.inception_date',
    'channel_details.total_no_of_videos',
    'channel_details.total_no_short_videos',
    'channel_details.total_no_long_videos',
    'channel_details.total_views',
    'channel_details.total_likes',
    'channel_details.total_comments',
    'channel_details.total_subscribers',
)
fields = dict.fromkeys(projected_fields, 1)

# Pull every document with the projection applied and materialise the cursor
documents = collection.find({}, dict(fields))
data = list(documents)

# Flatten the nested documents; nested keys become 'channel_details_<name>' columns
df_data = pd.json_normalize(data, sep='_')

# Strip the 'channel_details_' prefix from the flattened column names
df_data.columns = df_data.columns.str.replace('channel_details_', '', regex=False)

# Reduce both ISO timestamps to YYYY-MM-DD strings
date_columns = ['channel_start_date', 'inception_date']
for column in date_columns:
    # Fractional seconds are removed before parsing
    without_fraction = df_data[column].str.replace(r'\.\d+', '', regex=True)
    df_data[column] = pd.to_datetime(without_fraction).dt.strftime('%Y-%m-%d')

# Remove MongoDB's '_id' column when present
df_data = df_data.drop(columns='_id', errors='ignore')

# Persist the raw export
df_data.to_csv("Raw_Youtube_API_DATA.csv", index=False)

In [None]:
# Reload the exported CSV so the analysis below works from the file on disk
df_data = pd.read_csv("Raw_Youtube_API_DATA.csv")
# Preview the first rows
df_data.head()

In [None]:
# Coerce every count column to a numeric dtype; values that cannot be parsed become NaN
count_columns = [
    'total_views',
    'total_likes',
    'total_comments',
    'total_subscribers',
    'total_no_of_videos',
    'total_no_short_videos',
    'total_no_long_videos',
]
for count_column in count_columns:
    df_data[count_column] = pd.to_numeric(df_data[count_column], errors="coerce")

In [None]:
# (rows, columns) of the loaded data
df_data.shape

In [None]:
# Rows that contain at least one missing value
has_missing = df_data.isnull().any(axis=1)
null_rows = df_data[has_missing]
print(null_rows)
# Missing-value count per column
df_data.isnull().sum()

In [None]:
# Parse both date columns; values that cannot be parsed become NaT
for date_column in ('channel_start_date', 'inception_date'):
    df_data[date_column] = pd.to_datetime(df_data[date_column], errors="coerce")

In [None]:
# Reference date: the current instant, taken directly in UTC. (Taking the naive
# local time and labelling it as UTC would shift it by the machine's UTC offset.)
reference_date = pd.Timestamp.now(tz='UTC')
# Interpret both date columns as UTC. utc=True localizes naive values and converts
# aware ones, so this cell can be re-run without an "already tz-aware" error.
df_data['channel_start_date'] = pd.to_datetime(df_data['channel_start_date'], utc=True)
df_data['inception_date'] = pd.to_datetime(df_data['inception_date'], utc=True)
# Whole days elapsed since the channel was created and since the collection start date
df_data['days_since_start'] = (reference_date - df_data['channel_start_date']).dt.days
df_data['days_since_inception'] = (reference_date - df_data['inception_date']).dt.days

In [None]:
# Summary statistics of df_data (pandas' default column selection)
df_data.describe()

In [None]:
# Column dtypes and non-null counts
df_data.info()

Feature selection with SelectKBest

In [None]:
# Feature matrix: everything except identifiers, the raw date columns and the target
non_feature_columns = ['channel_id', 'channel_name', 'channel_start_date', 'inception_date', 'total_subscribers']
x = df_data.drop(columns=non_feature_columns)
# Target: subscriber count
y = df_data['total_subscribers']

In [None]:
# Score every feature against the target. k='all' keeps all of them: the previous
# k=11 was larger than the number of columns in x, which scikit-learn rejects or
# warns about depending on the version. Only the per-feature scores are used below.
# NOTE(review): chi2 is intended for non-negative features and a categorical target;
# total_subscribers is continuous, so f_regression (already imported) may be the
# more appropriate score function - confirm before relying on these scores.
best_feature = SelectKBest(score_func=chi2, k='all')
fit = best_feature.fit(x, y)


In [None]:
# One-column frames holding the selector's scores and the matching feature names
score_df = pd.DataFrame({'score': fit.scores_})
column_df = pd.DataFrame({'Feature': x.columns})

In [None]:
# Side-by-side table of feature names and their scores (both frames share the same index)
feature_score_df = column_df.join(score_df)
# Make sure the score column is numeric
feature_score_df['score'] = pd.to_numeric(feature_score_df['score'])
# Show floats with two decimals from here on (this is a session-wide display option)
pd.set_option('display.float_format', '{:.2f}'.format)
print(feature_score_df)

The higher the score, the more important the feature.

total_views, total_likes, total_comments are the most important features in predicting the number of subscribers.

In [None]:
# The eight highest-scoring features
print(feature_score_df.nlargest(8, 'score'))

In [None]:
# Zero-variance filter: identifies features that take the same value in every row.
# (VarianceThreshold is imported in the notebook's import cell.)
var = VarianceThreshold(threshold=0)
var.fit(x)



In [None]:
# Boolean mask over x's columns: True for features kept by the variance filter
var.get_support()

In [None]:
def correlation(df, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    Args:
        df: DataFrame whose pairwise column correlations are evaluated with
            ``df.corr()``.
        threshold: A column is reported when the absolute correlation with any
            column positioned before it is strictly greater than this value.

    Returns:
        Set of column names. For each correlated pair only the later column is
        included, so the earlier one can be kept as the representative.
    """
    corr_matrix = df.corr()
    names = corr_matrix.columns
    return {
        names[row]
        for row in range(len(names))
        # Only the part of the matrix below the diagonal is inspected
        if any(abs(corr_matrix.iloc[row, col]) > threshold for col in range(row))
    }

In [None]:
# Features whose absolute correlation with an earlier feature exceeds 0.6
correlation(x, 0.6)

In [None]:
# Pairwise correlation matrix of the features
x.corr()

In [None]:
# Annotated heatmap of the feature correlation matrix, colour scale fixed to [-1, 1]
cor = x.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1, center=0, ax=ax)
plt.show()

In [None]:
# Hold out 20% of the rows for evaluation.
# (train_test_split and mutual_info_regression are imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Mutual information between each training feature and the target. The estimator is
# randomized, so it is seeded to give the same scores on every run.
mi = mutual_info_regression(X_train, y_train, random_state=42)

In [None]:
# Mutual-information scores computed in the previous cell
mi

In [None]:
# Label each mutual-information score with its feature name, then rank from highest to lowest
mi = pd.Series(mi, index=x.columns)
mi.sort_values(ascending=False)

In [None]:
# Feature columns used to train the model.
# (RandomForestRegressor is imported in the notebook's import cell.)
rf_features = ['total_views', 'total_likes', 'total_comments', 'total_no_of_videos', 'total_no_long_videos', 'days_since_start']

# Seeded so that repeated runs build the same forest
rf = RandomForestRegressor(n_estimators=21, random_state=23)
rf.fit(X_train[rf_features], y_train)

In [None]:
# R^2 on the held-out split; the column list must match the one passed to rf.fit
rf.score(X_test[['total_views', 'total_likes', 'total_comments', 'total_no_of_videos', 'total_no_long_videos', 'days_since_start']], y_test)