En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

In [51]:
import os

file_path = r"\Users\DZWorld2\Documents\Studying\challenge_option\tweets.json\farmers-protest-tweets-2021-2-4.json"
# Ensure the file path is correctly formatted for Windows
file_path = file_path.replace("\\", "/")

# Check if the file exists
if os.path.exists(file_path):
    print("The file exists.")
else:
    
    print("The file does not exist.")

The file exists.


In [49]:
from datetime import datetime
import json
from collections import defaultdict, Counter
from typing import List, Tuple, Generator, Dict
# We this function to read the tweets from the file using a generator because im going to supose that the file is large and we dont want to load all the tweets into memory at once make it more memory efficient
def read_tweets(file_path: str):
    with open(file_path, 'r') as file:
        for line in file:
            
            yield json.loads(line)

def q1_memory(file_path: str) -> List[Tuple[datetime.date, str, int]]:
    tweet_counts_per_date = Counter()
    user_tweets_per_date = defaultdict(lambda: Counter())

    # Step 1 & 2: Read and process each tweet
    for tweet in read_tweets(file_path):
        date = datetime.strptime(tweet['date'], '%Y-%m-%dT%H:%M:%S%z').date()
        user = tweet['user']['username']
        tweet_counts_per_date[date] += 1
        user_tweets_per_date[date][user] += 1

    # Step 3 & 4: Identify top 10 dates
    top_10_dates = [date for date, _ in tweet_counts_per_date.most_common(10)]

    # Step 5: For each top date, find the user with the most tweets
    top_users = [(date, user_tweets.most_common(1)[0][0], user_tweets.most_common(1)[0][1]) for date, user_tweets in user_tweets_per_date.items() if date in top_10_dates]

    # Sort the result by date as the final step
    top_users_sorted = sorted(top_users, key=lambda x: x[0])

    return top_users_sorted
    


In [50]:
print(q1_memory(file_path))


[(datetime.date(2021, 2, 12), 'RanbirS00614606', 176), (datetime.date(2021, 2, 13), 'MaanDee08215437', 178), (datetime.date(2021, 2, 14), 'rebelpacifist', 119), (datetime.date(2021, 2, 15), 'jot__b', 134), (datetime.date(2021, 2, 16), 'jot__b', 133), (datetime.date(2021, 2, 17), 'RaaJVinderkaur', 185), (datetime.date(2021, 2, 18), 'neetuanjle_nitu', 195), (datetime.date(2021, 2, 19), 'Preetm91', 267), (datetime.date(2021, 2, 20), 'MangalJ23056160', 108), (datetime.date(2021, 2, 23), 'Surrypuria', 135)]


In [58]:
from src.q1_memory import q1_memory


In [59]:
result_q1_memory = q1_memory(file_path)
result_q1_memory

In [54]:
%load_ext memory_profiler

In [56]:
%timeit q1_memory.q1_memory(file_path)

75.2 ns ± 0.709 ns per loop (mean ± std. dev. of 7 runs, 10,000,000 loops each)


In [49]:
%memit q1_memory(file_path)

peak memory: 461.33 MiB, increment: 1.70 MiB


In [70]:
from typing import List, Tuple
from datetime import datetime
import pandas as pd
import json

def q1_time(file_path: str) -> List[Tuple[datetime.date, str, int]]:
    try:
        with open(file_path, 'r') as json_file:
            # Cargar archivo json linea a linea
            data = [json.loads(line.strip()) for line in json_file]
        # Usar una list comprehension para extraer los campos
        data = [(item['date'], item['user']['username'], item['id']) for item in data if 'date' in item and 'user' in item and 'id' in item and 'username' in item['user']]

        # Convertir la lista de tuplas en un dataframe de pandas
        df = pd.DataFrame(data, columns=['date', 'user', 'id'])
        
        # Convertir campo date en formato datetime
        df['date'] = pd.to_datetime(df['date']).dt.date

        tweet_counts = df.groupby('date').size()
        top_10_dates = tweet_counts.nlargest(10).index
        df_top_10 = df[df['date'].isin(top_10_dates)]
        top_users = df_top_10.groupby('date')['user'].agg(lambda x: x.value_counts().index[0])

        # Convertir el resultado en una lista de tuplas
        result = [(date, user) for date, user in zip(top_10_dates, top_users)]
        
        return result
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return []
    except json.JSONDecodeError:
        print("Error decoding JSON.")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

In [71]:

result_q1_fast = q1_time(file_path)
print(result_q1_fast)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'), (datetime.date(2021, 2, 13), 'MaanDee08215437'), (datetime.date(2021, 2, 17), 'rebelpacifist'), (datetime.date(2021, 2, 16), 'jot__b'), (datetime.date(2021, 2, 14), 'jot__b'), (datetime.date(2021, 2, 18), 'RaaJVinderkaur'), (datetime.date(2021, 2, 15), 'neetuanjle_nitu'), (datetime.date(2021, 2, 20), 'Preetm91'), (datetime.date(2021, 2, 23), 'MangalJ23056160'), (datetime.date(2021, 2, 19), 'Surrypuria')]


In [56]:
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [72]:
%timeit q1_time(file_path)

4.76 s ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
!pip install pyspark

Collecting pyspark
  Using cached pyspark-3.5.1-py2.py3-none-any.whl
Collecting py4j==0.10.9.7
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.7 pyspark-3.5.1



[notice] A new release of pip available: 22.2.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [74]:
from typing import List, Tuple
from datetime import date
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, count, row_number
from pyspark.sql.window import Window

def q1_time_spark(file_path: str) -> List[Tuple[date, str, int]]:
    try:
      # Initialize Spark session
      spark = SparkSession.builder.appName("Q1Time").getOrCreate()

      # Read JSON file
      df = spark.read.json(file_path)

      # Filter out rows with missing fields
      df_filtered = df.filter("date IS NOT NULL AND user IS NOT NULL AND id IS NOT NULL AND user.username IS NOT NULL")

      # Select and rename fields
      df_selected = df_filtered.select(to_date(col("date")).alias("date"), col("user.username").alias("user"), col("id"))

      # Count tweets per date and get top 10 dates
      top_dates_df = df_selected.groupBy("date").count().orderBy(col("count").desc()).limit(10)

      # Join back to get tweets from top 10 dates only
      df_top_dates = df_selected.join(top_dates_df.select("date"), "date")

      # Get top user per date
      windowSpec = Window.partitionBy("date").orderBy(col("count").desc())
      top_users_df = df_top_dates.groupBy("date", "user").agg(count("id").alias("count")).withColumn("row_number", row_number().over(windowSpec)).filter(col("row_number") == 1).select("date", "user")

      # Collect result to driver
      result = [(row.date, row.user) for row in top_users_df.collect()]

      spark.stop()
      return result
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

In [75]:
result_q1_spark = q1_time_spark(file_path)
result_q1_spark

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 19), 'Preetm91'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria')]

In [76]:
%timeit q1_time_spark(file_path)

2.97 s ± 364 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [1]:
from collections import Counter
from typing import List, Tuple
import json
import emoji

def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    emoji_counter = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            if 'content' in data:
                # Use a generator expression for memory efficiency
                emojis = (value.chars for value in emoji.analyze(data['content']))
                emoji_counter.update(emojis)
    top_10_emojis = emoji_counter.most_common(10)
    return top_10_emojis

In [13]:
file_path = r"\Users\DZWorld2\Documents\Studying\challenge_option\tweets.json\farmers-protest-tweets-2021-2-4.json"

In [3]:
result_q2_memory = q2_memory(file_path)
result_q2_memory

[('🙏', 5049),
 ('😂', 3072),
 ('🚜', 2972),
 ('🌾', 2182),
 ('🇮🇳', 2086),
 ('🤣', 1668),
 ('✊', 1651),
 ('❤️', 1382),
 ('🙏🏻', 1317),
 ('💚', 1040)]

In [15]:
import pandas as pd
import emoji

# Function to extract emojis from a text
def extract_emojis(text):
    # List to store found emojis
    found_emojis = []
    # Iterate over each character in the text
    for char in text:
        # Check if the character is an emoji
        if emoji.is_emoji(char):
            found_emojis.append(char)
    return found_emojis

# Main function to process the file and get top 10 emojis
def q2_pandas(file_path):
    # Read the file into a DataFrame
    df = pd.read_json(file_path, lines=True)
    
    # Filter rows to ensure 'content' is not null
    df = df[df['content'].notnull()]
    
    # Extract emojis from 'content'
    df['emojis'] = df['content'].apply(extract_emojis)
    
    # Flatten the list of emojis and create a Series
    all_emojis = pd.Series([emoji for sublist in df['emojis'] for emoji in sublist])
    
    # Count occurrences and get the top 10 emojis
    top_10_emojis = all_emojis.value_counts().head(10).reset_index()
    top_10_emojis.columns = ['emoji', 'count']
    
    return top_10_emojis.to_dict('records')

AttributeError: module 'emoji' has no attribute 'get_emoji_regexp'