# PYTHON BASICS FOR DATA SCIENCE

# 1. DATA TYPES & VARIABLES

In [1]:
# Numeric data types
age = 25                  # Integer (int) - whole numbers
height = 5.9              # Float (float) - numbers with a decimal point
complex_number = 3 + 2j   # Complex - real part 3, imaginary part 2 (the j suffix marks the imaginary part)

# Text data
name = "Data Scientist"   # String (str) - for text values

# Boolean values
is_valid = True           # Boolean (bool) - either True or False

# Print variable types
# type() returns the class of a value; the f-string embeds it in the message
print(f"Age is {type(age)}")
print(f"Name is {type(name)}")
print(f"Height is {type(height)}")

Age is <class 'int'>
Name is <class 'str'>
Height is <class 'float'>


# 2. DATA STRUCTURES

In [2]:
# Lists - ordered, mutable collections (can be changed after creation)
scores = [85, 92, 78, 90, 88]
scores.append(95)         # Add a new score to the end of the list
scores[0] = 87            # Replace the first score (indexing starts at 0)
print(f"Updated scores: {scores}")
# sum() divided by len() is the arithmetic mean; "/" on integers yields a float
print(f"Average score: {sum(scores)/len(scores)}")

Updated scores: [87, 92, 78, 90, 88, 95]
Average score: 88.33333333333333


In [3]:
# Dictionaries - key-value pairs for named data
student = {
    'name': 'Alex',
    'age': 22,
    'courses': ['Data Science 101', 'Python Programming', 'Statistics']  # a value can itself be a list
}
# Access values by their keys (looking up a key that is not present raises KeyError)
print(f"Student name: {student['name']}")
# len() of the nested list counts the courses
print(f"Student is taking {len(student['courses'])} courses")

Student name: Alex
Student is taking 3 courses


In [4]:
# Tuples - ordered, immutable collections (cannot be changed after creation)
dimensions = (1920, 1080)  # Screen resolution
# The next line is deliberately left commented out: item assignment on a tuple raises TypeError
# dimensions[0] = 1280    # This would cause an error - tuples cannot be modified


In [5]:
# Sets - unordered collections of unique elements
unique_visitors = {'user123', 'user456', 'user789', 'user123'}  # 'user123' is listed twice on purpose
# Sets have no defined order, so the printed order may differ from the literal above and between runs
print(f"Unique visitors: {unique_visitors}")  # Note: Duplicates are automatically removed

Unique visitors: {'user456', 'user123', 'user789'}


# 3. CONTROL FLOW & LOGIC

In [6]:
# Conditional statements for decision-making
def analyze_performance(score):
    """Return the performance label for a numeric score.

    Bands are checked from highest to lowest and the first lower bound
    the score reaches wins: 90+ is "Excellent", 80+ is "Good",
    70+ is "Average", anything else is "Needs improvement".
    """
    bands = (
        (90, "Excellent"),
        (80, "Good"),
        (70, "Average"),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    # Fell through every band
    return "Needs improvement"

# Example use: categorize a single score and print the resulting label
student_score = 85
print(f"Performance: {analyze_performance(student_score)}")

Performance: Good


In [7]:
# Loops for processing collections of data
scores = [85, 92, 78, 90, 88]  # fresh list; replaces the scores list modified in the Lists example
print("Individual analyses:")
# Visit each score in list order and print its category
for score in scores:
    print(f"Score {score}: {analyze_performance(score)}")

Individual analyses:
Score 85: Good
Score 92: Excellent
Score 78: Average
Score 90: Excellent
Score 88: Good


In [8]:
# Tally how many scores fall into each performance category.
# Every category starts at zero so that empty ones still appear in the summary.
categories = dict.fromkeys(("Excellent", "Good", "Average", "Needs improvement"), 0)
for score in scores:
    result = analyze_performance(score)
    categories[result] = categories[result] + 1

print("Performance summary:")
for category, count in categories.items():
    print(f"{category}: {count} students")

Performance summary:
Excellent: 2 students
Good: 2 students
Average: 1 students
Needs improvement: 0 students


# 4. LIST COMPREHENSIONS

In [9]:
# Traditional way to square numbers: start with an empty list and append inside a loop
numbers = [1, 2, 3, 4, 5]
squares = []
for num in numbers:
    squares.append(num ** 2)  # ** is the exponentiation operator
print(f"Squares (traditional): {squares}")

Squares (traditional): [1, 4, 9, 16, 25]


In [10]:
# Using list comprehension - much more concise
# Reads as "num ** 2 for every num in numbers"; builds the same list in a single expression
squares = [num ** 2 for num in numbers]
print(f"Squares (comprehension): {squares}")

Squares (comprehension): [1, 4, 9, 16, 25]


In [13]:
# Filtering with list comprehension
# The trailing "if" keeps only even values of num (num % 2 == 0); those are then squared
even_squares = [num ** 2 for num in numbers if num % 2 == 0]
print(f"Even squares only: {even_squares}")

Even squares only: [4, 16]


In [14]:
# Processing strings
names = ["Alice", "Bob", "Charlie", "David"]
# len() of a string is its number of characters; one length per name, in the same order
name_lengths = [len(name) for name in names]
print(f"Name lengths: {name_lengths}")

Name lengths: [5, 3, 7, 5]


In [15]:
# More complex example - extracting data
# Each record is a dictionary with the same three keys: name, age, score
data = [
    {"name": "Alice", "age": 25, "score": 92},
    {"name": "Bob", "age": 27, "score": 85},
    {"name": "Charlie", "age": 22, "score": 78}
]
# Extract names of people with scores over 80
# (the comparison is strict, so a score of exactly 80 would not be included)
high_scorers = [person["name"] for person in data if person["score"] > 80]
print(f"High scorers: {high_scorers}")

High scorers: ['Alice', 'Bob']


# 5. FUNCTIONS

In [16]:
def calculate_statistics(numbers):
    """
    Calculate basic statistics for a collection of numbers.

    Args:
        numbers: Iterable of numeric values. Lists, tuples and one-shot
            iterators such as generators are all accepted; the input is
            copied once so it is never iterated more than one time.

    Returns:
        Dictionary containing 'mean', 'median', 'range', 'count', 'min'
        and 'max'. When there is nothing to analyze, the dictionary
        {"error": "Empty list provided"} is returned instead.
    """
    # Falsy inputs (empty list, None, ...) are reported rather than raised
    if not numbers:
        return {"error": "Empty list provided"}

    # Materialize once: an iterator would be exhausted after the first
    # pass and every later sum()/min()/max() would see no data
    values = list(numbers)
    if not values:
        # An iterator is always truthy, so an exhausted one only shows up here
        return {"error": "Empty list provided"}

    # Sort a copy for the median; the original order is kept for the sum
    sorted_nums = sorted(values)
    n = len(sorted_nums)
    middle = n // 2

    # Even count: average the two central values; odd count: take the centre
    if n % 2 == 0:
        median = (sorted_nums[middle - 1] + sorted_nums[middle]) / 2
    else:
        median = sorted_nums[middle]

    # Compute the extremes once and reuse them for the range
    smallest = min(values)
    largest = max(values)

    return {
        'mean': sum(values) / n,
        'median': median,
        'range': largest - smallest,
        'count': n,
        'min': smallest,
        'max': largest
    }

# Using our function
data_sample = [12, 18, 9, 15, 11, 20, 14, 21, 13]
stats = calculate_statistics(data_sample)

print("Statistics for our data sample:")
# capitalize() upper-cases the first letter of each key for display
for key, value in stats.items():
    print(f"{key.capitalize()}: {value}")

Statistics for our data sample:
Mean: 14.777777777777779
Median: 14
Range: 12
Count: 9
Min: 9
Max: 21


In [17]:
# Using the function again with different data
another_sample = [92, 85, 78, 90, 88]
print("\nStatistics for test scores:")  # the leading \n prints a blank line before the heading
# The returned dictionary is iterated directly, without storing it in a variable first
for key, value in calculate_statistics(another_sample).items():
    print(f"{key.capitalize()}: {value}")


Statistics for test scores:
Mean: 86.6
Median: 88
Range: 14
Count: 5
Min: 78
Max: 92


# 6. WORKING WITH FILES

In [18]:
import csv
import os

# First, let's create some sample data
def create_sample_csv(filename='student_data.csv'):
    """Create a sample CSV file with student data.

    Writes a header row followed by five student records
    (name, age, score) and prints a confirmation message.

    Args:
        filename: Path of the file to write. Defaults to
            'student_data.csv', resolved against the current working
            directory. An existing file at that path is overwritten.
    """
    rows = [
        ['Name', 'Age', 'Score'],  # Header row
        ['Alice', 24, 92],
        ['Bob', 27, 85],
        ['Charlie', 22, 78],
        ['Diana', 25, 95],
        ['Evan', 23, 88],
    ]

    # newline='' lets the csv module control line endings itself
    with open(filename, 'w', newline='') as file:
        csv.writer(file).writerows(rows)

    print(f"Sample CSV file created: {filename}")

# Create our sample file (written as student_data.csv, relative to the current working directory)
create_sample_csv()

Sample CSV file created: student_data.csv


In [19]:
# Now, let's read and analyze the data
def analyze_csv_data(filename):
    """Read and analyze data from a CSV file.

    The file is expected to contain a header row followed by rows of
    three fields: name, age, score, where age and score are integers.
    The header and every data row are printed while reading.

    Args:
        filename: Path to the CSV file.

    Returns:
        Dictionary with 'count', 'avg_age', 'avg_score', 'min_score'
        and 'max_score'. If the file is missing, completely empty, or
        has no data rows, a dictionary with a single 'error' key is
        returned instead.

    Raises:
        ValueError: If a non-blank data row does not have exactly three
            fields, or its age/score cannot be parsed as an integer.
    """
    if not os.path.exists(filename):
        return {"error": f"File {filename} not found"}

    # Initialize storage for our data
    ages = []
    scores = []

    # newline='' is what the csv module requires so that newlines inside
    # quoted fields are parsed correctly
    with open(filename, 'r', newline='') as file:
        reader = csv.reader(file)

        # A default avoids StopIteration on a zero-length file
        header = next(reader, None)
        if header is None:
            return {"error": f"File {filename} is empty"}

        print(f"Analyzing {filename}...")
        print(f"Columns: {header}")

        # Process each data row
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to analyze
                continue
            name, age, score = row
            ages.append(int(age))
            scores.append(int(score))
            print(f"{name}, aged {age}, scored {score}")

    # Without data rows the averages would divide by zero
    if not ages:
        return {"error": f"No data rows found in {filename}"}

    # Return analysis results
    return {
        "count": len(ages),
        "avg_age": sum(ages) / len(ages),
        "avg_score": sum(scores) / len(scores),
        "min_score": min(scores),
        "max_score": max(scores)
    }

# Analyze our sample file; the summary dictionary is kept in results so it can be printed afterwards
results = analyze_csv_data('student_data.csv')

Analyzing student_data.csv...
Columns: ['Name', 'Age', 'Score']
Alice, aged 24, scored 92
Bob, aged 27, scored 85
Charlie, aged 22, scored 78
Diana, aged 25, scored 95
Evan, aged 23, scored 88


In [20]:
print("\nAnalysis results:")
for key, value in results.items():
    # Floats (the averages) are shown with two decimals; everything else prints unchanged
    shown = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    print(f"{key}: {shown}")


Analysis results:
count: 5
avg_age: 24.20
avg_score: 87.60
min_score: 78
max_score: 95


# 7. ERROR HANDLING

In [21]:
def safe_analyze_data(data):
    """Compute mean, max, min and range of a list, reporting errors as data.

    Every element is converted with float(), so the statistics are
    always floats. Failures are not propagated: the function returns a
    dictionary with a single 'error' key whose message is prefixed with
    "Type error", "Value error" or "Unexpected error".
    """
    try:
        # Validation failures are raised here and translated by the handlers below
        if not isinstance(data, list):
            raise TypeError("Input must be a list")
        if not data:
            raise ValueError("Cannot analyze empty list")

        # float() raises ValueError or TypeError for items it cannot convert
        values = [float(item) for item in data]

        highest = max(values)
        lowest = min(values)
        return {
            'mean': sum(values) / len(values),
            'max': highest,
            'min': lowest,
            'range': highest - lowest,
        }
    except TypeError as exc:
        return {"error": f"Type error: {exc!s}"}
    except ValueError as exc:
        return {"error": f"Value error: {exc!s}"}
    except Exception as exc:
        # Anything else, e.g. OverflowError from float() on a huge integer
        return {"error": f"Unexpected error: {exc!s}"}

# Exercise the function with one valid input and three failing ones.
# Headings after the first start with "\n" to leave a blank line between cases.
test_cases = [
    ("Valid data:", [10, 20, 30, 40, 50]),
    ("\nEmpty list:", []),
    ("\nMixed data types:", [10, 20, "thirty", 40, 50]),
    ("\nNot a list:", "this is a string"),
]
for heading, sample in test_cases:
    print(heading)
    print(safe_analyze_data(sample))

Valid data:
{'mean': 30.0, 'max': 50.0, 'min': 10.0, 'range': 40.0}

Empty list:
{'error': 'Value error: Cannot analyze empty list'}

Mixed data types:
{'error': "Value error: could not convert string to float: 'thirty'"}

Not a list:
{'error': 'Type error: Input must be a list'}
