In [None]:
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as stats
import seaborn as sns
from textblob import TextBlob


In [None]:
# Location of the raw CSV, relative to this notebook.
filepath = "../Resources/data.csv"
# Load the CSV into the working DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=filepath)
df.head(n=5)

In [None]:
#drop Date Posted column
df.drop(columns=["Date Posted"], inplace=True)

In [None]:
df.head()

In [None]:
#Drop Price EUR column
df.drop(columns=["Price EUR"], inplace=True)

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Separate lands from artifacts: any card whose Type Line mentions "Land"
# gets the marker "['L']" in its Colours column.
# na=False makes a missing Type Line count as "not a land", so the boolean
# mask never contains NaN (which .loc refuses to index with).
is_land = df["Type Line"].str.contains("Land", na=False)
df.loc[is_land, "Colours"] = "['L']"

# Replace missing Mana Cost values with the same "['L']" marker.
# NOTE(review): this fills *every* missing Mana Cost, not only rows flagged as
# lands above — confirm that all cards without a mana cost really are lands.
df.loc[df["Mana Cost"].isna(), "Mana Cost"] = "['L']"

df.head()  # preview of the cleaned data set

In [None]:
#This will drop null values from the Price USD column 
price_drop_df = df.dropna(subset=['Price USD'])

In [None]:
price_drop_df #the dataframe for the linregress

In [None]:
# Tally how many rows carry each colour / colour-combination value.
colour_column = df['Colours']
colours = colour_column.value_counts()
# Show the tally, then the type of the object that holds it.
print(colours, type(colours), sep="\n")
# Author's note from an earlier run: 23 variations, returned as a Series;
# no null values or bad data were spotted.

In [None]:
# Set 'Colours' to "['L']" wherever 'Type Line' contains 'Land'.
# This differentiates Lands from Artifacts, which are both colourless.
# na=False keeps the mask strictly boolean even if a Type Line is missing.
df.loc[df["Type Line"].str.contains("Land", na=False), "Colours"] = "['L']"

# Recount after the update so the printed tally reflects the column as it is
# now, rather than counts computed before the re-labelling.
colours = df['Colours'].value_counts()
print(colours)

In [None]:
df.info()

In [None]:
price_drop_df["Printing Dates"]=pd.to_datetime(df["Most Recent Printing"])

In [None]:
price_drop_df.info()

In [None]:
# Fixed reference date used to turn printing dates into ages.
today = pd.Timestamp(year=2024, month=6, day=3)

In [None]:
price_drop_df["Printing Age"] = (today-price_drop_df["Printing Dates"]).dt.days
price_drop_df.head()

In [None]:
df.info()

In [None]:
price_drop_df['Mana Value'] = df['Mana Value'].astype(int)

In [None]:
price_drop_df.head()

In [None]:
type_count = df['Type Line'].value_counts()
type_count

In [None]:
#Land count
keyword = 'Land'
count_with_land = df['Type Line'].str.contains(keyword).sum()
count_with_land

#Sorcery
#keyword = 'Sorcery'
#count_with_sorcery = df['Type Line'].str.contains(keyword).sum()
#count_with_sorcery

In [None]:
#Sorcery
keyword = 'Sorcery'
count_with_sorcery = df['Type Line'].str.contains(keyword).sum()
count_with_sorcery


In [None]:
keyword = 'Artifact'
count_with_artifact = df['Type Line'].str.contains(keyword).sum()
count_with_artifact


In [None]:
keyword = 'Creature'
count_with_creature = df['Type Line'].str.contains(keyword).sum()
count_with_creature

In [None]:
# Inputs for the regression: printing age (days) against USD price.
X = price_drop_df[['Printing Age']]  # single-column frame holding the ages
y = price_drop_df['Price USD']       # prices to be explained

# Ordinary least-squares fit on the flattened age values.
age_values = X.to_numpy().ravel()
slope, intercept, r_value, p_value, std_err = stats.linregress(age_values, y.to_numpy())

# Price predicted by the fitted line at every observed age.
predicted_prices = slope * age_values + intercept

# Draw the observations and the fitted line on the current axes.
ax = plt.gca()
ax.scatter(X, y, color='purple', label='Actual Data', edgecolors='black')
ax.plot(X, predicted_prices, color='red', label='Linear Regression')
ax.set_xlabel('Printing Age')
ax.set_ylabel('Price USD')
ax.set_title(f'Linear Regression: Price USD vs Printing Age\nR value: {r_value:.2f}')
ax.legend()
# Write the figure to disk instead of calling plt.show().
plt.savefig('linear_regression_plot.svg', format='svg')

In [None]:
#Further linregress breakdown
X = price_drop_df[['Printing Age']]  # Features (age column)
y = price_drop_df['Price USD']  # Target variable (price column)

# Fit the linear regression model
slope, intercept, r_value, p_value, std_err = stats.linregress(X.values.flatten(), y.values)

# Create the linear regression line
predicted_prices = slope * X.values.flatten() + intercept

# Specify the value of 'Rarity'
specific_Rarity = 'rare'

# Filter the DataFrame for the specific Rarity
subset_df = price_drop_df[price_drop_df['Rarity'] == specific_Rarity]

# Plot the original data and the linear regression line for the specific Rarity
plt.figure(figsize=(8, 6))
plt.scatter(subset_df['Printing Age'], subset_df['Price USD'], marker='o', label='Type Line: ' + specific_Rarity, edgecolors='black')

plt.plot(X, predicted_prices, color='red', label='Linear Regression')
plt.xlabel('Printing Age')
plt.ylabel('Price USD')
plt.title('Linear Regression: Price USD vs Printing Age for Type Line: ' + specific_Rarity)
plt.legend()
plt.text(0.95, 0.5, f'R value: {r_value:.2f}', transform=plt.gca().transAxes, fontsize=10, verticalalignment='center', horizontalalignment='right')
#plt.show()
plt.savefig('linear_regression_plot_sub1.svg', format='svg')

In [None]:
X = price_drop_df[['Printing Age']]  # Features (age column)
y = price_drop_df['Price USD']  # Target variable (price column)

# Fit the linear regression model
slope, intercept, r_value, p_value, std_err = stats.linregress(X.values.flatten(), y.values)

# Create the linear regression line
predicted_prices = slope * X.values.flatten() + intercept

# Specify the value of 'Rarity'
specific_Rarity = 'mythic'

# Filter the DataFrame for the specific Rarity
subset_df = price_drop_df[price_drop_df['Rarity'] == specific_Rarity]

# Plot the original data and the linear regression line for the specific Rarity
plt.figure(figsize=(8, 6))
plt.scatter(subset_df['Printing Age'], subset_df['Price USD'], marker='o', label='Type Line: ' + specific_Rarity, edgecolors='black')

plt.plot(X, predicted_prices, color='red', label='Linear Regression')
plt.xlabel('Printing Age')
plt.ylabel('Price USD')
plt.title('Linear Regression: Price USD vs Printing Age for Type Line: ' + specific_Rarity)
plt.legend()
plt.text(0.95, 0.5, f'R value: {r_value:.2f}', transform=plt.gca().transAxes, fontsize=10, verticalalignment='center', horizontalalignment='right')
#plt.show()
plt.savefig('linear_regression_plot_sub2.svg', format='svg')

In [None]:
X = price_drop_df[['Printing Age']]  # Features (age column)
y = price_drop_df['Price USD']  # Target variable (price column)

# Fit the linear regression model
slope, intercept, r_value, p_value, std_err = stats.linregress(X.values.flatten(), y.values)

# Create the linear regression line
predicted_prices = slope * X.values.flatten() + intercept

# Specify the value of 'Rarity'
specific_Rarity = 'common'

# Filter the DataFrame for the specific Rarity
subset_df = price_drop_df[price_drop_df['Rarity'] == specific_Rarity]

# Plot the original data and the linear regression line for the specific Rarity
plt.figure(figsize=(8, 6))
plt.scatter(subset_df['Printing Age'], subset_df['Price USD'], marker='o', label='Type Line: ' + specific_Rarity, edgecolors='black')

plt.plot(X, predicted_prices, color='red', label='Linear Regression')
plt.xlabel('Printing Age')
plt.ylabel('Price USD')
plt.title('Linear Regression: Price USD vs Printing Age for Type Line: ' + specific_Rarity)
plt.legend()
plt.text(0.95, 0.5, f'R value: {r_value:.2f}', transform=plt.gca().transAxes, fontsize=10, verticalalignment='center', horizontalalignment='right')
#plt.show()
plt.savefig('linear_regression_plot_sub3.svg', format='svg')

In [None]:
X = price_drop_df[['Printing Age']]  # Features (age column)
y = price_drop_df['Price USD']  # Target variable (price column)

# Fit the linear regression model
slope, intercept, r_value, p_value, std_err = stats.linregress(X.values.flatten(), y.values)

# Create the linear regression line
predicted_prices = slope * X.values.flatten() + intercept

# Specify the value of 'Rarity'
specific_Rarity = 'uncommon'

# Filter the DataFrame for the specific Rarity
subset_df = price_drop_df[price_drop_df['Rarity'] == specific_Rarity]

# Plot the original data and the linear regression line for the specific Rarity
plt.figure(figsize=(8, 6))
plt.scatter(subset_df['Printing Age'], subset_df['Price USD'], marker='o', label='Type Line: ' + specific_Rarity, edgecolors='black')
plt.plot(X, predicted_prices, color='red', label='Linear Regression')
plt.xlabel('Printing Age')
plt.ylabel('Price USD')
plt.title('Linear Regression: Price USD vs Printing Age for Type Line: ' + specific_Rarity)
plt.legend()
plt.text(0.95, 0.5, f'R value: {r_value:.2f}', transform=plt.gca().transAxes, fontsize=10, verticalalignment='center', horizontalalignment='right')
#plt.show()
plt.savefig('linear_regression_plot_sub4.svg', format='svg')

In [None]:
# Four hex colours used as the custom plotting palette.
custom_palette = [
    "#1BC640",
    "#1C91C6",
    "#CB3D36",
    "#9432B0",
]

In [None]:
rarity_count = df['Rarity'].value_counts()
rarity_count

In [None]:
total_cards = len(df)

# Calculate rarity counts
rarity_count = df['Rarity'].value_counts()

# Calculate percentages
rarity_percentages = (rarity_count / total_cards) * 100

#custom colors
sns.set_palette(custom_palette)

# Plot pie chart
plt.figure(figsize=(8, 6))
plt.pie(rarity_percentages, labels=rarity_percentages.index, autopct='%1.1f%%', startangle=140,)
plt.title('Rarity Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
#plt.show()
plt.savefig('pie_plot.svg', format='svg')

In [None]:
# Share of each rarity in percent. total_cards is computed here so the cell
# does not depend on another cell having been run first.
total_cards = len(df)
rarity_count = df['Rarity'].value_counts()
rarity_percentages = (rarity_count / total_cards) * 100

# A 1 x n matrix: one heatmap cell per rarity.
rarity_matrix = rarity_percentages.values.reshape(1, -1)

# Plot the heatmap. Passing xticklabels to seaborn places each rarity name at
# the centre of its cell; ticks set manually at 0..n-1 would sit on the cell
# edges. yticklabels=False hides the single, meaningless row label.
plt.figure(figsize=(8, 6))
sns.heatmap(
    rarity_matrix,
    cmap='viridis',
    annot=True,
    fmt='.1f',
    cbar=False,
    xticklabels=list(rarity_percentages.index),
    yticklabels=False,
)
plt.title('Rarity Distribution')
plt.xlabel('Rarity')
plt.ylabel('Percentage')
plt.tight_layout()

# Save the plot
plt.savefig('heatmap_rarity_distribution.svg', format='svg')
plt.show()

In [None]:
#rarity brea
rarity_breakdown = df.groupby('Rarity')['Card'].apply(list)
rarity_breakdown

In [None]:
#for comparing main board to sideboard
main_sideboard = df.groupby('Main/Sideboard')['Card'].apply(list)
main_sideboard

In [None]:
#average converted mana cost of cards in Archetype
average_archetype = df.groupby('Archetype')['Mana Value'].mean()
average_archetype

In [None]:
# Highest average mana value first.
average_archetype = average_archetype.sort_values(ascending=False)

# Single-colour palette for this chart.
archetype_color = ["#9432B0"]
sns.set_palette(archetype_color)

# Bar chart of the sorted averages, labelled through the returned axes.
bar_ax = average_archetype.plot.bar(figsize=(8, 6))
bar_ax.set_xlabel('Archetype')
bar_ax.set_ylabel('Average Mana Value')
bar_ax.set_title('Average Value by Archetype')
# Write the figure to disk instead of calling plt.show().
plt.savefig('bar_plot.svg', format='svg')

In [None]:
# Archetypes shown in this plot.
selected_archetypes = ['Enigmatic Fires', 'Izzet Creativity', 'Izzet Phoenix', 'Atarka Red', 'Azorius Spirits', 'Rakdos Sacrifice']
filtered_df = df[df['Archetype'].isin(selected_archetypes)]

# Left-to-right order: highest mean mana value first.
violin_order = (
    filtered_df.groupby('Archetype')['Mana Value']
    .mean()
    .sort_values(ascending=False)
    .index
)

plt.figure(figsize=(10, 6))
# Plot the filtered frame (the same one the order was derived from) instead of
# relying on `order` to hide the unselected archetypes of the full frame.
sns.violinplot(x='Archetype', y='Mana Value', data=filtered_df, order=violin_order)
plt.xlabel('Archetype')
plt.ylabel('Mana Value')
plt.title('Violin Plot of Mana Value by Archetype (Selection)')
#plt.show()
plt.savefig('violin_plot.svg', format='svg')

In [None]:
# Archetypes shown in this plot, in display order.
selected_archetypes = [
    'Enigmatic Fires',
    'Izzet Creativity',
    'Izzet Phoenix',
    'Atarka Red',
    'Azorius Spirits',
    'Rakdos Sacrifice',
]

# Rows belonging to one of the selected archetypes.
in_selection = df['Archetype'].isin(selected_archetypes)
filtered_df = df[in_selection]

plt.figure(figsize=(10, 6))

# Seaborn box plot of mana value per archetype; labels go through the returned axes.
box_ax = sns.boxplot(data=filtered_df, x='Archetype', y='Mana Value', order=selected_archetypes)
box_ax.set_xlabel('Archetype')
box_ax.set_ylabel('Mana Value')
box_ax.set_title('Box Plot of Mana Value by Archetype (Selection)')
plt.xticks(rotation=45)  # tilt the archetype names so they stay readable
plt.tight_layout()

# Write the figure to disk instead of calling plt.show().
plt.savefig('box_plot_selected_archetypes.svg', format='svg')
#plt.show()


In [None]:
# Wide figure: one x position per archetype.
plt.figure(figsize=(30, 6))
# One point per row, coloured by archetype; the hue order follows average_archetype's index.
scatter_ax = sns.scatterplot(
    data=df,
    x='Archetype',
    y='Mana Value',
    hue='Archetype',
    hue_order=average_archetype.index,
    s=100,
)
scatter_ax.set_xlabel('Archetype')
scatter_ax.set_ylabel('Mana Value')
scatter_ax.set_title('Scatter Plot of Mana Value by Archetype (Descending Order)')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
# Every archetype gets the same marker size. sizes=(200, 200) is a (min, max)
# range that seaborn spreads over however many archetypes exist, whereas a
# fixed 3-element list only matches a data set with exactly three archetypes.
# hue_order is omitted because it has no effect when no `hue` is mapped.
sns.scatterplot(data=df, x='Archetype', y='Mana Value', size='Archetype', sizes=(200, 200))
plt.xlabel('Archetype')
plt.ylabel('Mana Value')
plt.title('Bubble Plot of Mana Value by Archetype')
plt.show()

In [None]:
price_data = df['Price USD']
quantity_data = df['Quantity']

# Create a heatmap using Seaborn's kdeplot
plt.figure(figsize=(10, 8))
sns.kdeplot(x=price_data, y=quantity_data, cmap='viridis', cbar=True, fill=True)
plt.xlabel('Price')
plt.ylabel('Quantity')
plt.title('Heatmap of Price vs Quantity')
plt.show()

In [None]:
df.corr(numeric_only=True)

In [None]:
plt.figure(figsize=(10, 10))
# Rows per colour value and each value's share of all rows, in percent.
colours = df['Colours'].value_counts()
total_cards = len(df)
percentage_used = (colours / total_cards) * 100
# Two-column frame (colour, share) in the shape seaborn expects.
data = pd.DataFrame({
    'Color': colours.index,
    'Percentage Used': percentage_used.to_numpy(),
})
colour_box_ax = sns.boxplot(data=data, x='Color', y='Percentage Used')
colour_box_ax.set_title('Box Plot of Percentage of Cards Used by Color')
colour_box_ax.set_xlabel('Color')
colour_box_ax.set_ylabel('Percentage of Cards Used')
plt.show()

In [None]:
plt.figure(figsize=(40, 5))

# Rows per colour value and each value's share of all rows, in percent.
colours = df['Colours'].value_counts()
total_cards = len(df)
percentage_used = (colours / total_cards) * 100
data = pd.DataFrame({'Color': colours.index, 'percentage_used': percentage_used.values})

# Create the bar plot
barplot = sns.barplot(x='Color', y='percentage_used', data=data)

# Font size of the x-axis title
barplot.set_xlabel('Color', fontsize=30)

# Tick label font sizes. tick_params only restyles the labels; re-assigning
# them with set_*ticklabels(get_*ticklabels()) before the figure is drawn can
# freeze empty or stale label text.
barplot.tick_params(axis='x', labelsize=10)
barplot.tick_params(axis='y', labelsize=14)
plt.title('Bar Graph of Percentage of Cards Used by Color', fontsize=30)
plt.ylabel('Percentage of Cards Used', fontsize=25)

# Saved under its own file name so it does not overwrite
# 'linear_regression_plot.svg' written by the regression cell.
plt.savefig('bar_plot_colors.svg', format='svg')

In [None]:
# Seven: donut chart of the colour counts.
donut_ax = plt.gca()
donut_ax.pie(
    colours,
    labels=colours.index,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'width': 0.4},
    colors=custom_palette,
)
donut_ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# White disc drawn over the centre to form the hole of the donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)
donut_ax.set_title('Donut Chart of Color Distribution')
# Write the figure to disk instead of calling plt.show().
plt.savefig('donut_chart_colors.svg', format='svg')

In [None]:
# Seven: horizontal bar chart of the colour counts.
barh_ax = plt.figure(figsize=(10, 6)).gca()  # new 10x6 figure and its axes
barh_ax.barh(colours.index, colours.values, color='skyblue')
barh_ax.set_xlabel('Count')
barh_ax.set_ylabel('Colors')
barh_ax.set_title('Horizontal Bar Chart of Color Distribution')
# Write the figure to disk instead of calling plt.show().
plt.savefig('horizontal_bar_color_chart.svg', format='svg')

In [None]:
# Distinct card names in order of first appearance, as an array and as a plain list.
cards = pd.unique(df['Card'])
cards_list = cards.tolist()

In [None]:
# Laura: count how often each word occurs across the unique card names.
# The names are read straight from `cards_list`; `df` is deliberately left
# untouched so the full data set stays available to every other cell.

# Tokens to ignore: a few common stopwords and punctuation marks.
stopwords = ['the', 'a', 'an', 'and', ',', ':', ';', '.', '!', '?']

# Word -> number of occurrences across all card names.
keyword_frequency = Counter()

for card in cards_list:
    # Split the card name into words with TextBlob.
    blob = TextBlob(card)
    # Keep lower-cased words that are not stopwords and are longer than two characters.
    keyword_frequency.update(
        word.lower()
        for word in blob.words
        if word.lower() not in stopwords and len(word) > 2
    )

# Plain dict ordered by descending frequency; words with equal counts keep
# the order in which they were first seen.
sorted_keyword_frequency = dict(keyword_frequency.most_common())

# Print the sorted keyword frequency
for word, frequency in sorted_keyword_frequency.items():
    print(f"{word}: {frequency}")

In [None]:
# Laura Keyword Frequency Heatmap

# Build the Word / Frequency table from the counts produced by the
# keyword-frequency cell, so that `df_heatmap` is actually defined.
df_heatmap = pd.DataFrame(
    list(sorted_keyword_frequency.items()), columns=['Word', 'Frequency']
)

# Sort the DataFrame by frequency in descending order
df_heatmap_sorted = df_heatmap.sort_values(by='Frequency', ascending=False)
# Pivot to a grid: one row per distinct frequency, one column per word;
# cells with no (frequency, word) pair are filled with 0.
heatmap_data = df_heatmap_sorted.pivot(index='Frequency', columns='Word', values='Frequency').fillna(0)
# Create a heatmap
fig = px.imshow(heatmap_data, labels=dict(color='Frequency'), x=heatmap_data.columns, y=heatmap_data.index)
# Update layout
fig.update_layout(
    title="Keyword Frequency Heatmap",
    xaxis_title="Word",
    yaxis_title="Frequency"
)
# Ensure all frequency values are included in the y-axis ticks
fig.update_yaxes(tickmode='array', tickvals=df_heatmap_sorted['Frequency'])
# Show the plot
fig.show()