# convert_txt_to_csv

In [1]:
import csv
import re
from datetime import datetime

def convert_txt_to_csv(input_file, output_file, encoding="utf-8"):
    """Convert a tab-separated text file of dated values into a CSV file.

    Every non-blank input line is split on runs of tab characters. The first
    field is a date written as ``Month D, YYYY`` or ``Month D YYYY`` (full or
    abbreviated English month name) and is rewritten as ``DD-MM-YYYY``; the
    remaining fields are copied through unchanged. A fixed header row
    ``Date, Value1, Value2, Value3`` is written before the data.

    Args:
        input_file: Path of the tab-separated text file to read.
        output_file: Path of the CSV file to create (overwritten if present).
        encoding: Text encoding used for both files. Passed explicitly so the
            result does not depend on the platform default.

    Raises:
        ValueError: If a line's date matches none of the supported formats.
            The message names the input file, the 1-based line number and the
            offending text.
    """
    month_mapping = {
        "January": "Jan", "February": "Feb", "March": "Mar", "April": "Apr", "May": "May", "June": "Jun",
        "July": "Jul", "August": "Aug", "September": "Sep", "October": "Oct", "November": "Nov", "December": "Dec"
    }
    # Tried in order; %b matches abbreviated month names, which is why full
    # names are shortened via month_mapping first.
    date_formats = ("%b %d, %Y", "%b %d %Y")

    with open(input_file, 'r', encoding=encoding) as infile, \
            open(output_file, 'w', newline='', encoding=encoding) as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Date", "Value1", "Value2", "Value3"])  # Writing header

        for line_number, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue  # Skip empty lines

            parts = re.split(r'\t+', line)  # Split using tab(s)
            # A space left between the date and the following tab would make
            # strptime fail with "unconverted data remains", so trim it.
            date_str = parts[0].strip()
            values = parts[1:]

            # Normalize month format (only the first matching month name)
            for full, short in month_mapping.items():
                if full in date_str:
                    date_str = date_str.replace(full, short)
                    break

            # Handle different date formats
            date_obj = None
            for date_format in date_formats:
                try:
                    date_obj = datetime.strptime(date_str, date_format)
                    break
                except ValueError:
                    continue
            if date_obj is None:
                raise ValueError(
                    f"{input_file}: line {line_number}: unrecognised date {parts[0]!r}"
                )

            writer.writerow([date_obj.strftime("%d-%m-%Y")] + values)

# Usage: read "data.txt" and write the converted rows to "data.csv".
# Both paths are relative, so they resolve against the current working directory.
convert_txt_to_csv("data.txt", "data.csv")

# clean_csv

In [2]:
import csv
from dateutil import parser

def clean_csv(input_file, output_file, encoding="utf-8"):
    """Copy a CSV file, dropping empty rows and zero-padding the numbers.

    The first row is treated as the header and copied unchanged. For every
    later row the first column is kept as is and each remaining column is
    expected to hold hyphen-separated integers (e.g. ``1-22-3``), which are
    rewritten with at least two digits each (``01-22-03``).

    A cell that is empty or holds only ``-`` or ``–`` is treated as a missing
    value: rows in which every value cell is missing are skipped, and a
    missing cell inside an otherwise populated row is copied through
    unchanged instead of being parsed as a number.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to create (overwritten if present).
        encoding: Text encoding used for both files. Passed explicitly because
            the en dash marker is non-ASCII.

    Raises:
        ValueError: If a non-missing cell contains a part that is not an
            integer.
    """
    missing_markers = ("", "-", "–")

    def is_missing(cell):
        return cell.strip() in missing_markers

    with open(input_file, 'r', newline='', encoding=encoding) as infile, \
            open(output_file, 'w', newline='', encoding=encoding) as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        header = next(reader, None)  # None when the input has no rows at all
        if header is None:
            return
        writer.writerow(header)

        for row in reader:
            values = row[1:]
            # Also true for blank lines and date-only rows (no value cells).
            if all(is_missing(val) for val in values):
                continue  # Skip this row

            formatted_row = [row[0]]  # Keep the date as is
            for value in values:
                if is_missing(value):
                    # Splitting "-" on '-' yields empty strings that int()
                    # cannot parse, so leave the marker untouched.
                    formatted_row.append(value)
                    continue
                formatted_row.append(
                    "-".join(f"{int(part):02d}" for part in value.split('-'))
                )

            writer.writerow(formatted_row)

# Usage: read "data.csv" and write the cleaned rows to "cleaned_data.csv".
# Both paths are relative, so they resolve against the current working directory.
clean_csv("data.csv", "cleaned_data.csv")

# analyze_all_time

In [3]:
import pandas as pd
from collections import Counter
import random

# Load the CSV file. dtype=str keeps every cell as text so that zero-padded
# numbers such as "07" are not re-read as the integer 7.
df = pd.read_csv("cleaned_data.csv", dtype=str)

# Flatten the values (excluding the Date column)
all_numbers = []
for col in df.columns[1:]:  # Skip the Date column
    # dropna(): a missing cell would otherwise be stringified to "nan" and
    # counted as if it were a number.
    for values in df[col].dropna().str.split('-'):
        # Keep only numeric tokens; placeholder cells such as "-" split into
        # empty strings that must not enter the tally.
        all_numbers.extend(
            token.strip() for token in values if token.strip().isdecimal()
        )

# Count occurrences of each number
counter = Counter(all_numbers)
total_occurrences = sum(counter.values())

# Calculate each number's share of all occurrences, in percent
percentage_dict = {num: (count / total_occurrences) * 100 for num, count in counter.items()}

# Sort by percentage (highest to lowest); ties keep first-seen order
sorted_percentages = sorted(percentage_dict.items(), key=lambda x: x[1], reverse=True)

# Take the five most frequent numbers.
# NOTE(review): this is a frequency ranking of past data, not a statistical forecast.
predicted_numbers = [num for num, _ in sorted_percentages[:5]]  # Top 5 predictions

# Display results
print("Predicted next numbers:", predicted_numbers)

# Show only the top 10 numbers
print("Top 10 most frequent numbers of all time:")
for num, percentage in sorted_percentages[:10]:
    print(f"{num}: {percentage:.2f}%")

Predicted next numbers: ['12', '09', '16', '21', '07']
Top 10 most frequent numbers of all time:
12: 3.68%
09: 3.62%
16: 3.62%
21: 3.60%
07: 3.53%
03: 3.51%
19: 3.45%
25: 3.43%
20: 3.43%
31: 3.41%


# analyze_last_90_days

In [6]:
import pandas as pd
from collections import Counter
from datetime import datetime, timedelta

days = 90  # Size of the look-back window, in days

# Load the CSV file. dtype=str keeps the value columns as text so that
# .split('-') below always receives a string, even for all-numeric columns.
df = pd.read_csv("cleaned_data.csv", dtype=str)

# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Use the latest date in the dataset as the reference point
current_date = df['Date'].max()

# Start of the window: `days` days before the latest date.
# (The name is historical; it is only three months when days == 90.)
three_months_ago = current_date - timedelta(days)

# Filter the dataset to include only rows inside the window
filtered_df = df[df['Date'] >= three_months_ago]

# Combine all values into a single list, leaving out missing cells (NaN),
# which have no .split method.
all_values = (
    filtered_df['Value1'].dropna().tolist()
    + filtered_df['Value2'].dropna().tolist()
    + filtered_df['Value3'].dropna().tolist()
)

# Split the values into individual numbers
all_numbers = []
for value in all_values:
    all_numbers.extend(value.split('-'))

# Convert the numbers to integers, skipping non-numeric tokens (a placeholder
# cell such as "-" splits into empty strings that int() cannot parse).
all_numbers = [int(num) for num in all_numbers if num.strip().isdecimal()]

# Count the frequency of each number
number_frequency = Counter(all_numbers)

# Sort the numbers by frequency in descending order; ties keep first-seen order
sorted_numbers = sorted(number_frequency.items(), key=lambda x: x[1], reverse=True)

# Take the top N most frequent numbers.
# NOTE(review): this is a frequency ranking of past data, not a statistical forecast.
top_n = 5  # Change this value to get more or fewer predictions
predicted_numbers = [num for num, freq in sorted_numbers[:top_n]]

print(f"Top {top_n} numbers with the highest probability to be picked next (last {days} days): {predicted_numbers}")

Top 5 numbers with the highest probability to be picked next (last 90 days): [7, 25, 29, 9, 1]
