In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import datetime

In [2]:
dfvid = pd.read_csv('../data/original/DEvideos_cc50_202101.csv')
dfvid.dtypes

video_id                   object
trending_date              object
title                      object
channel_title              object
category_id                 int64
publish_time               object
tags                       object
views                       int64
likes                       int64
dislikes                    int64
comment_count               int64
thumbnail_link             object
comments_disabled            bool
ratings_disabled             bool
video_error_or_removed       bool
description                object
state                      object
lat                       float64
lon                       float64
geometry                   object
dtype: object

In [3]:
#Aqui leemos el archivo json y usamos la función json_normalize para que las categorias estén al mismo nivel
with open('../data/original/DE_category_id.json','r') as f:
    data = json.loads(f.read())
dfcatDE = pd.json_normalize(data,record_path=['items'])
with open('../data/original/US_category_id.json','r') as f:
    data = json.loads(f.read())
dfcatUS = pd.json_normalize(data,record_path=['items'])
dfcat = pd.concat([dfcatDE,dfcatUS])
dfcat.dtypes

kind                  object
etag                  object
id                    object
snippet.channelId     object
snippet.title         object
snippet.assignable      bool
dtype: object

In [4]:
#Limpiamos primero el dataset de dfcat
#eliminamos kind y etag
dfcat = dfcat.drop(['kind','etag','snippet.channelId','snippet.assignable'],axis=1, errors='ignore')
#convertimos el id a int, para hacer join con el otro dataframe
dfcat = dfcat.astype({'id':int})
#eliminamos las filas repetidas
dfcat.drop_duplicates(keep='first',inplace=True)
dfcat.dtypes

id                int32
snippet.title    object
dtype: object

In [5]:
#Unimos los datasets
dfvid = dfvid.astype({'category_id':int})
dfvid = pd.merge(dfvid,dfcat,how='inner',left_on='category_id',right_on='id').drop(['id','category_id'],axis=1)
dfvid.dtypes

video_id                   object
trending_date              object
title                      object
channel_title              object
publish_time               object
tags                       object
views                       int64
likes                       int64
dislikes                    int64
comment_count               int64
thumbnail_link             object
comments_disabled            bool
ratings_disabled             bool
video_error_or_removed       bool
description                object
state                      object
lat                       float64
lon                       float64
geometry                   object
snippet.title              object
dtype: object

In [6]:
#Ahora convertimos los datos 
dfvid['video_id'] = dfvid['video_id'].astype(str)
dfvid['title'] = dfvid['title'].astype(str)
dfvid['channel_title'] = dfvid['channel_title'].astype(str)
dfvid['tags'] = dfvid['tags'].astype(str)
dfvid['thumbnail_link'] = dfvid['thumbnail_link'].astype(str)
dfvid['description'] = dfvid['description'].astype(str)
dfvid['state'] = dfvid['state'].astype(str)
dfvid['snippet.title'] = dfvid['snippet.title'].astype(str)
dfvid['trending_date'] = pd.to_datetime(dfvid['trending_date'], format="%y.%d.%m")
dfvid['publish_time'] = pd.to_datetime(dfvid['publish_time'])
dfvid

Unnamed: 0,video_id,trending_date,title,channel_title,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,state,lat,lon,geometry,snippet.title
0,LgVi6y5QIjM,2017-11-14,Sing zu Ende! | Gesangseinlagen vom Feinsten |...,inscope21,2017-11-13 17:08:49+00:00,"inscope21|""sing zu ende""|""gesangseinlagen""|""ge...",252786,35885,230,1539,https://i.ytimg.com/vi/LgVi6y5QIjM/default.jpg,False,False,False,Heute gibt es mal wieder ein neues Format... w...,Schleswig Holstein,54.783748,9.433315,POINT (9.433315388 54.78374778),Entertainment
1,1ZAPwfrtAFY,2017-11-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,2017-11-13 07:30:00+00:00,"last week tonight trump presidency|""last week ...",2418783,97190,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John...",Saarland,49.250390,6.970003,POINT (6.970003213 49.25039044),Entertainment
2,ZJ9We4bjcg0,2017-11-14,18 SONGS mit Kelly MissesVlog (Sing-off),rezo,2017-11-12 13:10:36+00:00,"kelly|""missesvlog""|""kelly song""|""bausa""|""bausa...",822213,100684,2467,10244,https://i.ytimg.com/vi/ZJ9We4bjcg0/default.jpg,False,False,False,18 Song Mashup über den (veränderten) Beat von...,Berlin,52.521819,13.401549,POINT (13.40154862 52.52181866),Entertainment
3,2hu_evXPpMM,2017-11-14,Dagi Bee wird Heiraten | Coldmirror bekommt Eh...,HerrNewstime,2017-11-12 16:33:18+00:00,"Dagi Bee|""Heiraten""|""Coldmirror""|""YouTube""|""Tr...",228574,11349,990,1049,https://i.ytimg.com/vi/2hu_evXPpMM/default.jpg,False,False,False,Dagi Bee wird Heiraten | Coldmirror bekommt Eh...,Thuringen,50.970053,11.029962,POINT (11.02996212 50.97005292),Entertainment
4,OKYUtHvgMhc,2017-11-14,Die Welt in 30 Jahren - Sommer 2047 - VOLKAN,VOLKAN,2017-11-12 16:09:44+00:00,"Welt|""Natur""|""Umwelt""|""Klima""|""Klimawandel""",37877,1839,327,170,https://i.ytimg.com/vi/OKYUtHvgMhc/default.jpg,False,False,False,"Über die Zukunft nachzudenken macht Spaß, oder...",Sachsen,50.829984,12.919976,POINT (12.91997595 50.82998395),Entertainment
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40835,YvlYjLPgrCE,2018-06-11,Street Food in Ghana - GIANT CHOP-BAR LUNCH an...,Mark Wiens,2018-06-10 13:00:08+00:00,"Ghana|""Ghana food""|""Ghanaian food""|""fufu""|""ban...",304116,11398,333,1883,https://i.ytimg.com/vi/YvlYjLPgrCE/default.jpg,False,False,False,This was an incredible day of street food in G...,Nordrhein Westfalen,52.029988,8.530011,POINT (8.530011351000001 52.02998822),Travel & Events
40836,GZa2B-knFGw,2018-06-14,Village Food in West Africa - BEST FUFU and EX...,Mark Wiens,2018-06-13 13:00:04+00:00,"ghana|""fufu""|""West Africa""|""African food""|""Gha...",252328,13557,235,2253,https://i.ytimg.com/vi/GZa2B-knFGw/default.jpg,False,False,False,This was one of the most rewarding experiences...,Schleswig Holstein,54.783748,9.433315,POINT (9.433315388 54.78374778),Travel & Events
40837,riupcv-tf1s,2018-03-25,Fack Ju Göhte 3 - Trailer,Fack Ju Göhte 3 - Trailer,2018-03-08 22:42:50+00:00,[none],8804,0,0,0,https://i.ytimg.com/vi/riupcv-tf1s/default.jpg,True,False,False,"Homo Faber, Kurvendiskussion, Asbest in den To...",Niedersachsen,53.129999,8.220004,POINT (8.220004434 53.1299986),Trailers
40838,l7E0kTvARsA,2018-06-01,Golak Bugni Bank Te Batua Full Movie (HD) | Ha...,Rhythm Boyz,2018-05-31 13:30:04+00:00,"full movie|""full punjabi movie""|""golak""|""bugni...",781977,22027,783,1096,https://i.ytimg.com/vi/l7E0kTvARsA/default.jpg,False,False,False,Rhythm Boyz Entertainment & Hayre Omjee Studio...,Rheinland Pfalz,49.982472,8.273219,POINT (8.273219156 49.98247246),Movies


In [7]:
dfvid.isnull().sum() 

video_id                  0
trending_date             0
title                     0
channel_title             0
publish_time              0
tags                      0
views                     0
likes                     0
dislikes                  0
comment_count             0
thumbnail_link            0
comments_disabled         0
ratings_disabled          0
video_error_or_removed    0
description               0
state                     0
lat                       0
lon                       0
geometry                  0
snippet.title             0
dtype: int64

In [9]:
#Normalización
#Vamos a crear un nuevo dataframe a base del actual, que tenga los comentarios activados 
dfnew = dfvid[(dfvid['comments_disabled']==False) & (dfvid['ratings_disabled']==False)]
#Ahora analizamos las columnas: 
'videos con views en 0: '+str(dfnew[(dfnew['views']==0)].count().sum())+' | videos con comment_count en 0: '+str(dfnew[(dfnew['comment_count']==0)].count().sum())+' | videos con dislikes en 0: '+str(dfnew[(dfnew['dislikes']==0)].count().sum())+' | videos con likes en 0: '+str(dfnew[(dfnew['likes']==0)].count().sum())


'videos con views en 0: 0 | videos con comment_count en 0: 5440 | videos con dislikes en 0: 4960 | videos con likes en 0: 120'

In [10]:
dfnew.to_csv('../data/moded/dfvideos_comentarios_valoraciones.csv')
