In [6]:
# Switch the kernel's working directory to the local dataset folder; the relative
# paths used in later cells ('0000.parquet', 'videos_1000', the CSV files) resolve against it.
# NOTE(review): absolute, machine-specific path — change it before running on another machine.
cd C:\Users\Abhishek Singh\Downloads\MovieData

C:\Users\Abhishek Singh\Downloads\MovieData


In [7]:
import pandas as pd

# Path to the Parquet file, relative to the current working directory
file_path = '0000.parquet'

# Load the Parquet file into a DataFrame using the pyarrow engine
df = pd.read_parquet(file_path, engine='pyarrow')

# Print pandas' truncated text view (first and last rows) as a quick sanity check.
# `df` is the notebook-level dataframe passed to download_videos() in a later cell.
print(df)

            videoid                                         contentUrl  \
0          21179416  https://ak.picdn.net/shutterstock/videos/21179...   
1           5629184  https://ak.picdn.net/shutterstock/videos/56291...   
2        1063125190  https://ak.picdn.net/shutterstock/videos/10631...   
3        1039695998  https://ak.picdn.net/shutterstock/videos/10396...   
4           9607838  https://ak.picdn.net/shutterstock/videos/96078...   
...             ...                                                ...   
1792495  1021499794  https://ak.picdn.net/shutterstock/videos/10214...   
1792496  1033753421  https://ak.picdn.net/shutterstock/videos/10337...   
1792497  1017392800  https://ak.picdn.net/shutterstock/videos/10173...   
1792498  1032128717  https://ak.picdn.net/shutterstock/videos/10321...   
1792499  1010553644  https://ak.picdn.net/shutterstock/videos/10105...   

            duration       page_dir  \
0        PT00H00M11S  006001_006050   
1        PT00H00M29S  071501_0715

In [None]:
import os
import requests
import csv


def download_videos(df, download_folder="videos_1000", max_rows=1000, timeout=30):
  """
  Downloads videos from URLs in a dataframe and writes an index CSV describing them.

  For each of the first `max_rows` rows, the video at `contentUrl` is saved as
  `<download_folder>/<videoid>.mp4` unless that file already exists. One row is
  written to "downloaded_videos.csv" (in the current working directory) for every
  processed dataframe row, whether or not its download succeeded. Download
  failures are printed and do not stop the run.

  Args:
    df: The pandas dataframe containing video data. Must provide the columns
      "videoid", "contentUrl" and "name".
    download_folder: The folder where downloaded videos will be saved. (default: "videos_1000")
    max_rows: The maximum number of rows to process. (default: 1000)
    timeout: Seconds to wait on the server before a download is abandoned. (default: 30)

  Returns:
    None.
  """
  # Create download folder if it doesn't exist
  os.makedirs(download_folder, exist_ok=True)

  # newline="" hands line-ending control to the csv module. Without it, text-mode
  # newline translation on Windows turns the writer's "\r\n" into "\r\r\n", which
  # shows up as a blank row after every record.
  with open("downloaded_videos.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["video_id", "video_path", "description"])

    # Iterate over at most `max_rows` rows
    for _, row in df.iloc[:max_rows].iterrows():
      video_id = row["videoid"]
      video_url = row["contentUrl"]
      video_name = row["name"]

      # Generate filename with extension
      filename = f"{video_id}.mp4"  # Adjust extension if needed based on video type
      video_path = os.path.join(download_folder, filename)

      if os.path.isfile(video_path):
        print(f"Skipping video: {video_id} (already exists)")
      else:
        _fetch_video(video_url, video_path, video_id, timeout)

      # Write data to CSV regardless of download or encoding status
      writer.writerow([video_id, video_path, _csv_description(video_name, video_id)])


def _fetch_video(video_url, video_path, video_id, timeout):
  """Streams one video to `video_path`, printing the outcome instead of raising."""
  # Download into a sibling temp file and rename only once complete, so an
  # interrupted transfer never leaves a truncated .mp4 that later runs would
  # skip as "already exists".
  partial_path = video_path + ".part"
  try:
    # The context manager releases the streamed connection on every path;
    # the timeout keeps a stalled server from hanging the loop forever.
    with requests.get(video_url, stream=True, timeout=timeout) as response:
      if response.status_code != 200:
        print(f"Failed to download video: {video_id} - Status Code: {response.status_code}")
        return
      with open(partial_path, 'wb') as f:
        for chunk in response.iter_content(1024):
          f.write(chunk)
    os.replace(partial_path, video_path)
    print(f"Downloaded video: {video_id}")
  except Exception as e:
    # Deliberately broad: one bad URL or network error must not abort the batch.
    print(f"Error downloading video: {video_id} - {e}")
    try:
      os.remove(partial_path)
    except OSError:
      pass  # Nothing was written (or it is already gone); nothing to clean up.


def _csv_description(video_name, video_id):
  """Returns the text stored in the CSV "description" column for one video."""
  if not isinstance(video_name, str):
    # Missing names (None/NaN) have no .encode(); record an empty description
    # rather than aborting the whole run with an AttributeError.
    return ""
  try:
    # Re-reads the name's UTF-8 bytes as latin-1 text, so non-ASCII characters are
    # stored double-encoded in this UTF-8 file. Kept as is because
    # fix_csv_empty_rows() writes its output as latin-1, which turns them back
    # into plain UTF-8 bytes. NOTE(review): change both together or not at all.
    return video_name.encode('utf-8').decode('latin-1')
  except UnicodeEncodeError:
    # Only .encode() can fail here (e.g. lone surrogates); latin-1 decoding accepts any byte.
    print(f"Skipping video name encoding for video: {video_id} (potential encoding issue)")
    return "Encoding Error"  # Placeholder for names that cannot be encoded

# Run with the defaults on the notebook-level dataframe `df`: process its first 1000 rows,
# saving videos under "videos_1000" and writing "downloaded_videos.csv"
download_videos(df)

Downloaded video: 21179416
Downloaded video: 5629184
Downloaded video: 1063125190
Downloaded video: 1039695998
Downloaded video: 9607838
Downloaded video: 21157780
Downloaded video: 1016180245
Downloaded video: 19107358
Downloaded video: 1036840850
Downloaded video: 13347398
Downloaded video: 1026436784
Downloaded video: 9000853
Downloaded video: 1047392872
Downloaded video: 30436075
Downloaded video: 6248114
Downloaded video: 1041349231
Downloaded video: 22517557
Downloaded video: 10807676
Downloaded video: 3280088
Downloaded video: 1022584264
Downloaded video: 1032669275
Downloaded video: 1016442058
Downloaded video: 5770451


In [16]:
import pandas as pd

def process_downloaded_videos(csv_file="downloaded_videos.csv"):
  """
  Loads the video index CSV and prints a preview of at most ten rows.

  Args:
    csv_file: Path of the CSV file to preview. (default: "downloaded_videos.csv")

  Returns:
    None. If the file does not exist, an error message is printed instead.
  """
  try:
    # head(10) yields every row when the frame holds fewer than ten
    preview = pd.read_csv(csv_file).head(10)
  except FileNotFoundError:
    print(f"Error: CSV file '{csv_file}' not found.")
    return

  print(preview)

# Preview the default index file, "downloaded_videos.csv"
process_downloaded_videos()


     video_id                  video_path  \
0    21179416    videos_1000\21179416.mp4   
1     5629184     videos_1000\5629184.mp4   
2  1063125190  videos_1000\1063125190.mp4   
3  1039695998  videos_1000\1039695998.mp4   
4     9607838     videos_1000\9607838.mp4   
5    21157780    videos_1000\21157780.mp4   
6  1016180245  videos_1000\1016180245.mp4   
7    19107358    videos_1000\19107358.mp4   
8  1036840850  videos_1000\1036840850.mp4   
9    13347398    videos_1000\13347398.mp4   

                                         description  
0                          Aerial shot winter forest  
1  Senior couple looking through binoculars on sa...  
2  A beautiful cookie with oranges lies on a gree...  
3  Japanese highrise office skyscrapers tokyo square  
4  Zrenjanin,serbia march 21 2015: fans watching ...  
5     Young beautiful woman using smartphone in cafe  
6  3d render of inky injections into water with l...  
7  Swimming in the pool ,slow motion 120 fps,hand...  
8  Circa 

In [None]:
import csv


def fix_csv_empty_rows(csv_file="downloaded_videos.csv", output_file="fixed_downloaded_videos.csv",
                       input_encoding="utf-8", output_encoding="latin-1"):
  """
  Copies a CSV file to a new file, dropping every empty row along the way.

  An "empty row" is a blank line, i.e. a record the csv reader returns with no
  fields at all. Rows that contain fields (even empty ones) are copied as is.
  The first non-empty row (normally the header) is copied like any other row.

  Args:
    csv_file: The path to the CSV file with empty rows. (default: "downloaded_videos.csv")
    output_file: The path to the output CSV file without empty rows. (default: "fixed_downloaded_videos.csv")
    input_encoding: Text encoding used to read `csv_file`. (default: "utf-8")
    output_encoding: Text encoding used to write `output_file`. The default,
      "latin-1", raises UnicodeEncodeError if a row contains characters outside
      latin-1. (default: "latin-1")

  Raises:
    FileNotFoundError: If `csv_file` does not exist.
    UnicodeEncodeError: If a row cannot be represented in `output_encoding`.
  """
  # Both files are opened with newline="" as the csv module requires: otherwise
  # universal-newline translation rewrites CR/LF characters embedded in quoted
  # fields on input, and adds an extra "\r" per line on output on Windows.
  with open(csv_file, "r", newline="", encoding=input_encoding) as infile, \
       open(output_file, "w", newline="", encoding=output_encoding) as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Blank lines come back from the reader as empty lists; keep everything else.
    # An empty input file simply produces an empty output file.
    writer.writerows(row for row in reader if row)

  print(f"Fixed CSV file written to: {output_file}")


# Run with the defaults: read "downloaded_videos.csv" and write "fixed_downloaded_videos.csv"
fix_csv_empty_rows()

In [None]:
import os
import csv


def verify_downloaded_videos(download_folder="videos_1000", csv_file="fixed_downloaded_videos.csv"):
  """
  Verifies that every file in the download folder has a corresponding entry in the CSV file.

  A file matches when its name without the final extension equals a value in
  the CSV's first column. The check is one-directional: CSV rows whose video
  file is absent from the folder are not reported. Results are printed.

  Args:
    download_folder: The path to the folder containing downloaded videos. (default: "videos_1000")
    csv_file: The path to the CSV file containing video data. (default: "fixed_downloaded_videos.csv")

  Returns:
    None. A missing folder, a missing CSV file or a CSV that is not valid
    UTF-8 is reported with a printed error message.
  """
  # Collect the regular files in the download folder; sorted so the report is
  # stable from run to run (os.listdir order is arbitrary).
  try:
    video_files = sorted(
      entry for entry in os.listdir(download_folder)
      if os.path.isfile(os.path.join(download_folder, entry))
    )
  except FileNotFoundError:
    print(f"Error: Download folder '{download_folder}' not found.")
    return

  # Collect the video IDs listed in the CSV (first column)
  try:
    with open(csv_file, "r", newline="", encoding="utf-8") as csvfile:
      reader = csv.reader(csvfile)
      next(reader, None)  # Skip the header row
      # Blank lines are returned as empty lists; skip them instead of failing on row[0]
      video_ids_in_csv = {row[0] for row in reader if row}
  except FileNotFoundError:
    print(f"Error: CSV file '{csv_file}' not found.")
    return
  except UnicodeDecodeError:
    print(f"Error: Unable to decode CSV file '{csv_file}' using UTF-8. Consider using a different encoding.")
    return

  # Find missing videos (not in CSV but present in folder). splitext strips only
  # the final extension, so an ID that itself contains a dot is kept intact.
  file_ids = (os.path.splitext(name)[0] for name in video_files)
  missing_videos = [file_id for file_id in file_ids if file_id not in video_ids_in_csv]

  # Print results
  if missing_videos:
    print("Missing video entries in CSV:")
    for video in missing_videos:
      print(f"- {video}")
  else:
    print(f"All videos in '{download_folder}' have corresponding entries in '{csv_file}'.")


# Check the default folder "videos_1000" against the default index "fixed_downloaded_videos.csv"
verify_downloaded_videos()
